The following values were not passed to `accelerate launch` and had defaults used instead:
	`--num_processes` was set to a value of `2`
		More than one GPU was found, enabling multi-GPU training.
		If this was unintended please pass in `--num_processes=1`.
	`--num_machines` was set to a value of `1`
	`--mixed_precision` was set to a value of `'no'`
	`--dynamo_backend` was set to a value of `'no'`
To avoid this warning pass in values for each of the problematic parameters or run `accelerate config`.
[2026-04-17 02:08:45,271] [WARNING] [torchao] Skipping import of cpp extensions due to incompatible torch version. Please upgrade to torch >= 2.11.0 (found 2.10.0+cu128).
[2026-04-17 02:08:45,439] [WARNING] [torchao] Skipping import of cpp extensions due to incompatible torch version. Please upgrade to torch >= 2.11.0 (found 2.10.0+cu128).
[2026-04-17 02:08:47,222] [WARNING] [axolotl.utils.schemas.validation] sample_packing without flash, sdp, xformers, sage, or flex attention does not handle cross sample decontamination.
[2026-04-17 02:08:47,223] [INFO] [axolotl.utils.schemas.validation] Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing
[2026-04-17 02:08:47,223] [WARNING] [axolotl.utils.schemas.validation] Configuring FSDP fields with the `fsdp_` prefix is deprecated. Please omit the `fsdp_` prefix from the any fields in `fsdp_config`.
[2026-04-17 02:08:47,467] [INFO] [axolotl.cli.config] config:
{
  "activation_offloading": false,
  "axolotl_config_path": "/workspace/data/sage-classifier-train-scripts/qwen3/fft/qwen3-4B-train-v1-6-no-liger-flex-magnifi-module-classifier-04-17-relabelled-upsampled.yml",
  "base_model": "Tifin-Sage/magnifi-classifier-01-05-search-agent-3-epochs-3k-unknown-errors",
  "base_model_config": "Tifin-Sage/magnifi-classifier-01-05-search-agent-3-epochs-3k-unknown-errors",
  "batch_size": 2,
  "bf16": true,
  "capabilities": {
    "bf16": true,
    "compute_capability": "sm_80",
    "fp8": false,
    "n_gpu": 2,
    "n_node": 1,
    "tf32": true
  },
  "chat_template": "qwen3",
  "context_parallel_size": 1,
  "dataloader_num_workers": 2,
  "dataloader_pin_memory": true,
  "dataloader_prefetch_factor": 256,
  "dataset_num_proc": 128,
  "dataset_prepared_path": "/workspace/data/datasets_prepared/magnifi-module-classifier-04-17-relabelled-upsampled",
  "datasets": [
    {
      "chat_template": "tokenizer_default",
      "field_messages": "messages",
      "message_property_mappings": {
        "content": "content",
        "role": "role"
      },
      "path": "Tifin-Sage/magnifi-module-classifier-04-17-relabelled-upsampled",
      "split": "train",
      "trust_remote_code": false,
      "type": "chat_template"
    }
  ],
  "ddp": true,
  "device": "cuda:0",
  "device_map": {
    "": 0
  },
  "dion_rank_fraction": 1.0,
  "dion_rank_multiple_of": 1,
  "eaft_alpha": 1.0,
  "eaft_k": 20,
  "env_capabilities": {
    "torch_version": "2.10.0"
  },
  "eval_batch_size": 1,
  "eval_causal_lm_metrics": [
    "sacrebleu",
    "comet",
    "ter",
    "chrf"
  ],
  "eval_max_new_tokens": 128,
  "eval_sample_packing": true,
  "eval_steps": 0.25,
  "eval_table_size": 0,
  "evals_per_epoch": 2,
  "experimental_skip_move_to_device": true,
  "fp16": false,
  "fsdp": [
    "full_shard",
    "auto_wrap"
  ],
  "fsdp_config": {
    "activation_checkpointing": true,
    "auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
    "cpu_ram_efficient_loading": true,
    "fsdp_version": 2,
    "offload_params": false,
    "reshard_after_forward": true,
    "state_dict_type": "FULL_STATE_DICT",
    "transformer_layer_cls_to_wrap": "Qwen3DecoderLayer"
  },
  "fsdp_version": 2,
  "generate_samples": false,
  "generation_do_sample": true,
  "generation_max_new_tokens": 50,
  "generation_prompt_ratio": 0.5,
  "generation_temperature": 0.7,
  "gradient_accumulation_steps": 1,
  "gradient_checkpointing": false,
  "hub_model_id": "Tifin-Sage/magnifi-module-classifier-04-17-relabelled-upsampled",
  "include_tkps": true,
  "layer_offloading": false,
  "learning_rate": 2e-05,
  "lisa_layers_attribute": "model.layers",
  "load_best_model_at_end": false,
  "load_in_4bit": false,
  "load_in_8bit": false,
  "local_rank": 0,
  "logging_steps": 1,
  "lora_dropout": 0.0,
  "loraplus_lr_embedding": 1e-06,
  "lr_scheduler": "cosine",
  "mean_resizing_embeddings": false,
  "merge_method": "memory_efficient",
  "micro_batch_size": 1,
  "model_config_type": "qwen3",
  "num_epochs": 2.0,
  "num_generation_samples": 3,
  "optimizer": "adamw_torch_fused",
  "otel_metrics_host": "localhost",
  "otel_metrics_port": 8000,
  "output_dir": "/workspace/data/outputs/qwen3-4B/fft_magnifi-module-classifier-04-17-relabelled-upsampled/",
  "pad_to_sequence_len": true,
  "pretrain_multipack_attn": true,
  "profiler_steps_start": 0,
  "qlora_sharded_model_loading": false,
  "quantize_moe_experts": false,
  "ray_num_workers": 1,
  "resources_per_worker": {
    "GPU": 1
  },
  "sample_packing": true,
  "sample_packing_bin_size": 200,
  "sample_packing_group_size": 100000,
  "save_only_model": false,
  "save_safetensors": true,
  "save_steps": 0.5,
  "saves_per_epoch": 1,
  "sequence_len": 16000,
  "shuffle_before_merging_datasets": false,
  "shuffle_merged_datasets": true,
  "skip_prepare_dataset": false,
  "streaming_multipack_buffer_size": 10000,
  "strict": false,
  "tensor_parallel_size": 1,
  "tf32": true,
  "tiled_mlp_use_original_mlp": true,
  "tokenizer_config": "Tifin-Sage/magnifi-classifier-01-05-search-agent-3-epochs-3k-unknown-errors",
  "tokenizer_save_jinja_files": true,
  "torch_dtype": "torch.bfloat16",
  "train_on_inputs": false,
  "trl": {
    "async_prefetch": false,
    "log_completions": false,
    "mask_truncated_completions": false,
    "ref_model_mixup_alpha": 0.9,
    "ref_model_sync_steps": 64,
    "replay_buffer_size": 0,
    "replay_recompute_logps": true,
    "reroll_max_groups": 1,
    "reroll_start_fraction": 1.0,
    "reward_num_workers": 1,
    "scale_rewards": true,
    "skip_zero_advantage_batches": true,
    "sync_ref_model": false,
    "use_data_producer": false,
    "use_vllm": false,
    "vllm_lora_sync": false,
    "vllm_server_host": "0.0.0.0",
    "vllm_server_port": 8000
  },
  "use_otel_metrics": false,
  "use_ray": false,
  "use_wandb": true,
  "val_set_size": 0.1,
  "vllm": {
    "device": "auto",
    "dtype": "auto",
    "gpu_memory_utilization": 0.9,
    "host": "0.0.0.0",
    "port": 8000
  },
  "wandb_name": "magnifi-module-classifier-04-17-relabelled-upsampled",
  "wandb_project": "sage-classifier",
  "warmup_ratio": 0.1,
  "weight_decay": 0.0,
  "world_size": 2
}
[2026-04-17 02:08:51,607] [INFO] [axolotl.utils.data.shared] Loading prepared dataset from disk at /workspace/data/datasets_prepared/magnifi-module-classifier-04-17-relabelled-upsampled/6241b9d0f4bdccc4ed4f52e5adefd1bc...
[Gloo] Rank [Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1
0 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1
[2026-04-17 02:08:57,019] [INFO] [axolotl.utils.samplers.multipack] gather_len_batches: [54, 54]
[2026-04-17 02:08:57,129] [INFO] [axolotl.utils.trainer] sample_packing_eff_est across ranks: [0.9649779796600342, 0.9649779796600342]
[2026-04-17 02:09:01,870] [INFO] [axolotl.utils.samplers.multipack] gather_len_batches: [478, 478]
[2026-04-17 02:09:01,872] [INFO] [axolotl.utils.trainer] sample_packing_eff_est across ranks: [0.969444751739502, 0.9775572419166565]
[2026-04-17 02:09:01,874] [INFO] [axolotl.utils.data.sft] Maximum number of steps set at 478
[2026-04-17 02:09:03,028] [INFO] [axolotl.loaders.patch_manager] Applying multipack dataloader patch for sample packing...
Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]Fetching 2 files:  50%|█████     | 1/2 [00:05<00:05,  5.40s/it]Fetching 2 files: 100%|██████████| 2/2 [00:05<00:00,  2.70s/it]
Fetching 2 files:  50%|█████     | 1/2 [00:05<00:05,  5.42s/it]Fetching 2 files: 100%|██████████| 2/2 [00:05<00:00,  2.71s/it]
Loading weights:   0%|          | 0/399 [00:00<?, ?it/s]Loading weights:  76%|███████▌  | 303/399 [00:00<00:00, 3028.93it/s]Loading weights: 100%|██████████| 399/399 [00:00<00:00, 2952.58it/s]
Loading weights:   0%|          | 0/399 [00:00<?, ?it/s]Loading weights:  83%|████████▎ | 331/399 [00:00<00:00, 3306.31it/s]Loading weights: 100%|██████████| 399/399 [00:00<00:00, 3610.06it/s]
[2026-04-17 02:09:10,437] [INFO] [axolotl.loaders.model] Converting modules to torch.bfloat16
[2026-04-17 02:09:12,900] [WARNING] [accelerate.utils.dataclasses] sync_module_states is obsolete in FSDP2, as it is not needed anymore.Setting sync_module_states to None.
[2026-04-17 02:09:13,195] [WARNING] [accelerate.utils.dataclasses] sync_module_states is obsolete in FSDP2, as it is not needed anymore.Setting sync_module_states to None.
[2026-04-17 02:09:13,778] [INFO] [axolotl.train] Pre-saving tokenizer to /workspace/data/outputs/qwen3-4B/fft_magnifi-module-classifier-04-17-relabelled-upsampled/...
[2026-04-17 02:09:13,930] [INFO] [axolotl.train] Pre-saving model config to /workspace/data/outputs/qwen3-4B/fft_magnifi-module-classifier-04-17-relabelled-upsampled/...
[2026-04-17 02:09:13,980] [INFO] [axolotl.train] Starting trainer...
[2026-04-17 02:09:19,915] [INFO] [axolotl.utils.samplers.multipack] gather_len_batches: [480, 480]
[2026-04-17 02:09:20,072] [INFO] [axolotl.monkeypatch.accelerate.fsdp2] Broadcasting full state dict to all ranks...
wandb: [wandb.login()] Loaded credentials for https://api.wandb.ai from WANDB_API_KEY.
wandb: Currently logged in as: subhanandh-t (subhanandh-t-tifin) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
wandb: setting up run 4dxwgvhh
wandb: Tracking run with wandb version 0.26.0
wandb: Run data is saved locally in /workspace/wandb/run-20260417_020921-4dxwgvhh
wandb: Run `wandb offline` to turn off syncing.
wandb: Syncing run magnifi-module-classifier-04-17-relabelled-upsampled
wandb: ⭐️ View project at https://wandb.ai/subhanandh-t-tifin/sage-classifier
wandb: 🚀 View run at https://wandb.ai/subhanandh-t-tifin/sage-classifier/runs/4dxwgvhh
wandb: WARNING Saving files without folders. If you want to preserve subdirectories pass base_path to wandb.save, i.e. wandb.save("/mnt/folder/file.h5", base_path="/mnt")
wandb: WARNING Symlinked 1 file into the W&B run directory; call wandb.save again to sync new files.
[2026-04-17 02:09:24,132] [INFO] [axolotl.utils.callbacks] The Axolotl config has been saved to the WandB run under files.
  0%|          | 0/478 [00:00<?, ?it/s][2026-04-17 02:09:24,137] [INFO] [axolotl.core.trainers.base] Running evaluation step...
[2026-04-17 02:09:29,462] [INFO] [axolotl.utils.samplers.multipack] gather_len_batches: [54, 54]

  0%|          | 0/27 [00:00<?, ?it/s]
  7%|▋         | 2/27 [00:01<00:18,  1.37it/s]
 11%|█         | 3/27 [00:04<00:37,  1.57s/it]
 15%|█▍        | 4/27 [00:06<00:46,  2.01s/it]
 19%|█▊        | 5/27 [00:09<00:49,  2.27s/it]
 22%|██▏       | 6/27 [00:12<00:51,  2.43s/it]
 26%|██▌       | 7/27 [00:15<00:50,  2.54s/it]
 30%|██▉       | 8/27 [00:17<00:49,  2.60s/it]
 33%|███▎      | 9/27 [00:20<00:47,  2.65s/it]
 37%|███▋      | 10/27 [00:23<00:45,  2.68s/it]
 41%|████      | 11/27 [00:26<00:43,  2.70s/it]
 44%|████▍     | 12/27 [00:28<00:39,  2.63s/it]
 48%|████▊     | 13/27 [00:31<00:38,  2.76s/it]
 52%|█████▏    | 14/27 [00:34<00:35,  2.77s/it]
 56%|█████▌    | 15/27 [00:37<00:33,  2.76s/it]
 59%|█████▉    | 16/27 [00:40<00:30,  2.76s/it]
 63%|██████▎   | 17/27 [00:42<00:27,  2.76s/it]
 67%|██████▋   | 18/27 [00:45<00:24,  2.76s/it]
 70%|███████   | 19/27 [00:48<00:22,  2.76s/it]
 74%|███████▍  | 20/27 [00:51<00:19,  2.77s/it]
 78%|███████▊  | 21/27 [00:53<00:16,  2.68s/it]
 81%|████████▏ | 22/27 [00:56<00:13,  2.79s/it]
 85%|████████▌ | 23/27 [00:59<00:11,  2.78s/it]
 89%|████████▉ | 24/27 [01:02<00:08,  2.77s/it]
 93%|█████████▎| 25/27 [01:04<00:05,  2.77s/it]
 96%|█████████▋| 26/27 [01:07<00:02,  2.76s/it]
100%|██████████| 27/27 [01:10<00:00,  2.79s/it]                                       
                                               {'eval_loss': '0.2049', 'eval_runtime': '75.78', 'eval_samples_per_second': '2.758', 'eval_steps_per_second': '1.386', 'eval_ppl': '1.227', 'memory/max_active (GiB)': '27.41', 'memory/max_allocated (GiB)': '27.41', 'memory/device_reserved (GiB)': '30.62', 'epoch': 0}
  0%|          | 0/478 [01:21<?, ?it/s]
100%|██████████| 27/27 [01:11<00:00,  2.79s/it]
                                                 0%|          | 1/478 [01:37<12:54:11, 97.38s/it]                                                  {'loss': '0.1978', 'grad_norm': '6.188', 'learning_rate': '0', 'ppl': '1.219', 'memory/max_active (GiB)': '37.9', 'memory/max_allocated (GiB)': '37.9', 'memory/device_reserved (GiB)': '49.99', 'tokens/train_per_sec_per_gpu': '22.08', 'tokens/total': 32000, 'tokens/trainable': 687, 'epoch': '0.004167'}
  0%|          | 1/478 [01:37<12:54:11, 97.38s/it]  0%|          | 2/478 [01:52<6:27:59, 48.91s/it]                                                  {'loss': '0.2119', 'grad_norm': '6.781', 'learning_rate': '4.255e-07', 'ppl': '1.236', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.84', 'tokens/total': 64000, 'tokens/trainable': 1310, 'epoch': '0.008333'}
  0%|          | 2/478 [01:52<6:27:59, 48.91s/it]  1%|          | 3/478 [02:07<4:24:28, 33.41s/it]                                                 {'loss': '0.1929', 'grad_norm': '7.031', 'learning_rate': '8.511e-07', 'ppl': '1.213', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '18.47', 'tokens/total': 96000, 'tokens/trainable': 1862, 'epoch': '0.0125'}
  1%|          | 3/478 [02:07<4:24:28, 33.41s/it]  1%|          | 4/478 [02:22<3:26:25, 26.13s/it]                                                 {'loss': '0.1885', 'grad_norm': '8.938', 'learning_rate': '1.277e-06', 'ppl': '1.207', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '18.06', 'tokens/total': 128000, 'tokens/trainable': 2402, 'epoch': '0.01667'}
  1%|          | 4/478 [02:22<3:26:25, 26.13s/it]  1%|          | 5/478 [02:37<2:54:16, 22.11s/it]                                                 {'loss': '0.1914', 'grad_norm': '6.75', 'learning_rate': '1.702e-06', 'ppl': '1.211', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '18.36', 'tokens/total': 160000, 'tokens/trainable': 2951, 'epoch': '0.02083'}
  1%|          | 5/478 [02:37<2:54:16, 22.11s/it]  1%|▏         | 6/478 [02:52<2:34:49, 19.68s/it]                                                 {'loss': '0.1206', 'grad_norm': '4.875', 'learning_rate': '2.128e-06', 'ppl': '1.128', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.83', 'tokens/total': 192000, 'tokens/trainable': 3574, 'epoch': '0.025'}
  1%|▏         | 6/478 [02:52<2:34:49, 19.68s/it]  1%|▏         | 7/478 [03:07<2:22:25, 18.14s/it]                                                 {'loss': '0.2275', 'grad_norm': '9', 'learning_rate': '2.553e-06', 'ppl': '1.256', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '19.33', 'tokens/total': 224000, 'tokens/trainable': 4152, 'epoch': '0.02917'}
  1%|▏         | 7/478 [03:07<2:22:25, 18.14s/it]  2%|▏         | 8/478 [03:22<2:14:12, 17.13s/it]                                                 {'loss': '0.2021', 'grad_norm': '7.312', 'learning_rate': '2.979e-06', 'ppl': '1.224', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '19.73', 'tokens/total': 256000, 'tokens/trainable': 4742, 'epoch': '0.03333'}
  2%|▏         | 8/478 [03:22<2:14:12, 17.13s/it]  2%|▏         | 9/478 [03:37<2:08:38, 16.46s/it]                                                 {'loss': '0.1592', 'grad_norm': '4.969', 'learning_rate': '3.404e-06', 'ppl': '1.173', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.34', 'tokens/total': 288000, 'tokens/trainable': 5410, 'epoch': '0.0375'}
  2%|▏         | 9/478 [03:37<2:08:38, 16.46s/it]  2%|▏         | 10/478 [03:52<2:04:46, 16.00s/it]                                                  {'loss': '0.1953', 'grad_norm': '5.938', 'learning_rate': '3.83e-06', 'ppl': '1.216', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.77', 'tokens/total': 320000, 'tokens/trainable': 6031, 'epoch': '0.04167'}
  2%|▏         | 10/478 [03:52<2:04:46, 16.00s/it]  2%|▏         | 11/478 [04:07<2:02:05, 15.69s/it]                                                  {'loss': '0.1826', 'grad_norm': '5.5', 'learning_rate': '4.255e-06', 'ppl': '1.2', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '19.52', 'tokens/total': 352000, 'tokens/trainable': 6615, 'epoch': '0.04583'}
  2%|▏         | 11/478 [04:07<2:02:05, 15.69s/it]  3%|▎         | 12/478 [04:22<2:00:07, 15.47s/it]                                                  {'loss': '0.2266', 'grad_norm': '6.594', 'learning_rate': '4.681e-06', 'ppl': '1.254', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.01', 'tokens/total': 384000, 'tokens/trainable': 7213, 'epoch': '0.05'}
  3%|▎         | 12/478 [04:22<2:00:07, 15.47s/it]  3%|▎         | 13/478 [04:37<1:58:41, 15.32s/it]                                                  {'loss': '0.248', 'grad_norm': '7.031', 'learning_rate': '5.106e-06', 'ppl': '1.282', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '18.44', 'tokens/total': 416000, 'tokens/trainable': 7764, 'epoch': '0.05417'}
  3%|▎         | 13/478 [04:37<1:58:41, 15.32s/it]  3%|▎         | 14/478 [04:52<1:57:38, 15.21s/it]                                                  {'loss': '0.1992', 'grad_norm': '6.344', 'learning_rate': '5.532e-06', 'ppl': '1.22', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '18.93', 'tokens/total': 448000, 'tokens/trainable': 8330, 'epoch': '0.05833'}
  3%|▎         | 14/478 [04:52<1:57:38, 15.21s/it]  3%|▎         | 15/478 [05:06<1:56:49, 15.14s/it]                                                  {'loss': '0.1826', 'grad_norm': '7.469', 'learning_rate': '5.957e-06', 'ppl': '1.2', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.44', 'tokens/total': 480000, 'tokens/trainable': 9001, 'epoch': '0.0625'}
  3%|▎         | 15/478 [05:07<1:56:49, 15.14s/it]  3%|▎         | 16/478 [05:21<1:56:10, 15.09s/it]                                                  {'loss': '0.1689', 'grad_norm': '5.688', 'learning_rate': '6.383e-06', 'ppl': '1.184', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.78', 'tokens/total': 512000, 'tokens/trainable': 9652, 'epoch': '0.06667'}
  3%|▎         | 16/478 [05:21<1:56:10, 15.09s/it]  4%|▎         | 17/478 [05:36<1:55:34, 15.04s/it]                                                  {'loss': '0.1113', 'grad_norm': '6.594', 'learning_rate': '6.809e-06', 'ppl': '1.118', 'memory/max_active (GiB)': '45.17', 'memory/max_allocated (GiB)': '45.17', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '23.16', 'tokens/total': 544000, 'tokens/trainable': 10343, 'epoch': '0.07083'}
  4%|▎         | 17/478 [05:36<1:55:34, 15.04s/it]  4%|▍         | 18/478 [05:51<1:55:09, 15.02s/it]                                                  {'loss': '0.1919', 'grad_norm': '6.281', 'learning_rate': '7.234e-06', 'ppl': '1.212', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.71', 'tokens/total': 576000, 'tokens/trainable': 11022, 'epoch': '0.075'}
  4%|▍         | 18/478 [05:51<1:55:09, 15.02s/it]  4%|▍         | 19/478 [06:06<1:54:47, 15.00s/it]                                                  {'loss': '0.2617', 'grad_norm': '7.594', 'learning_rate': '7.66e-06', 'ppl': '1.299', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '18.8', 'tokens/total': 608000, 'tokens/trainable': 11584, 'epoch': '0.07917'}
  4%|▍         | 19/478 [06:06<1:54:47, 15.00s/it]  4%|▍         | 20/478 [06:21<1:54:27, 14.99s/it]                                                  {'loss': '0.1699', 'grad_norm': '6.531', 'learning_rate': '8.085e-06', 'ppl': '1.185', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.31', 'tokens/total': 640000, 'tokens/trainable': 12221, 'epoch': '0.08333'}
  4%|▍         | 20/478 [06:21<1:54:27, 14.99s/it]  4%|▍         | 21/478 [06:36<1:54:08, 14.99s/it]                                                  {'loss': '0.1992', 'grad_norm': '7.25', 'learning_rate': '8.511e-06', 'ppl': '1.22', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.05', 'tokens/total': 672000, 'tokens/trainable': 12880, 'epoch': '0.0875'}
  4%|▍         | 21/478 [06:36<1:54:08, 14.99s/it]  5%|▍         | 22/478 [06:51<1:53:51, 14.98s/it]                                                  {'loss': '0.2231', 'grad_norm': '6.562', 'learning_rate': '8.936e-06', 'ppl': '1.25', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.91', 'tokens/total': 704000, 'tokens/trainable': 13505, 'epoch': '0.09167'}
  5%|▍         | 22/478 [06:51<1:53:51, 14.98s/it]  5%|▍         | 23/478 [07:06<1:53:32, 14.97s/it]                                                  {'loss': '0.2583', 'grad_norm': '8.5', 'learning_rate': '9.362e-06', 'ppl': '1.295', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.84', 'tokens/total': 736000, 'tokens/trainable': 14187, 'epoch': '0.09583'}
  5%|▍         | 23/478 [07:06<1:53:32, 14.97s/it]  5%|▌         | 24/478 [07:21<1:53:16, 14.97s/it]                                                  {'loss': '0.1807', 'grad_norm': '7.062', 'learning_rate': '9.787e-06', 'ppl': '1.198', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '16.49', 'tokens/total': 768000, 'tokens/trainable': 14680, 'epoch': '0.1'}
  5%|▌         | 24/478 [07:21<1:53:16, 14.97s/it]  5%|▌         | 25/478 [07:36<1:53:01, 14.97s/it]                                                  {'loss': '0.2788', 'grad_norm': '6.625', 'learning_rate': '1.021e-05', 'ppl': '1.322', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.58', 'tokens/total': 800000, 'tokens/trainable': 15325, 'epoch': '0.1042'}
  5%|▌         | 25/478 [07:36<1:53:01, 14.97s/it]  5%|▌         | 26/478 [07:51<1:52:46, 14.97s/it]                                                  {'loss': '0.1494', 'grad_norm': '4.75', 'learning_rate': '1.064e-05', 'ppl': '1.161', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '19.33', 'tokens/total': 832000, 'tokens/trainable': 15903, 'epoch': '0.1083'}
  5%|▌         | 26/478 [07:51<1:52:46, 14.97s/it]  6%|▌         | 27/478 [08:06<1:52:31, 14.97s/it]                                                  {'loss': '0.1714', 'grad_norm': '5.812', 'learning_rate': '1.106e-05', 'ppl': '1.187', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.14', 'tokens/total': 864000, 'tokens/trainable': 16535, 'epoch': '0.1125'}
  6%|▌         | 27/478 [08:06<1:52:31, 14.97s/it]  6%|▌         | 28/478 [08:21<1:52:13, 14.96s/it]                                                  {'loss': '0.2041', 'grad_norm': '6.062', 'learning_rate': '1.149e-05', 'ppl': '1.226', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.99', 'tokens/total': 896000, 'tokens/trainable': 17221, 'epoch': '0.1167'}
  6%|▌         | 28/478 [08:21<1:52:13, 14.96s/it]  6%|▌         | 29/478 [08:36<1:51:59, 14.96s/it]                                                  {'loss': '0.1694', 'grad_norm': '4.969', 'learning_rate': '1.191e-05', 'ppl': '1.185', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.54', 'tokens/total': 928000, 'tokens/trainable': 17895, 'epoch': '0.1208'}
  6%|▌         | 29/478 [08:36<1:51:59, 14.96s/it]  6%|▋         | 30/478 [08:51<1:51:45, 14.97s/it]                                                  {'loss': '0.1841', 'grad_norm': '5.438', 'learning_rate': '1.234e-05', 'ppl': '1.202', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '24.18', 'tokens/total': 960000, 'tokens/trainable': 18618, 'epoch': '0.125'}
  6%|▋         | 30/478 [08:51<1:51:45, 14.97s/it]  6%|▋         | 31/478 [09:06<1:51:30, 14.97s/it]                                                  {'loss': '0.1812', 'grad_norm': '5.281', 'learning_rate': '1.277e-05', 'ppl': '1.199', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.14', 'tokens/total': 992000, 'tokens/trainable': 19250, 'epoch': '0.1292'}
  6%|▋         | 31/478 [09:06<1:51:30, 14.97s/it]  7%|▋         | 32/478 [09:21<1:51:16, 14.97s/it]                                                  {'loss': '0.2358', 'grad_norm': '5.875', 'learning_rate': '1.319e-05', 'ppl': '1.266', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.54', 'tokens/total': 1024000, 'tokens/trainable': 19924, 'epoch': '0.1333'}
  7%|▋         | 32/478 [09:21<1:51:16, 14.97s/it]  7%|▋         | 33/478 [09:36<1:51:01, 14.97s/it]                                                  {'loss': '0.1631', 'grad_norm': '5.312', 'learning_rate': '1.362e-05', 'ppl': '1.177', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.76', 'tokens/total': 1056000, 'tokens/trainable': 20545, 'epoch': '0.1375'}
  7%|▋         | 33/478 [09:36<1:51:01, 14.97s/it]  7%|▋         | 34/478 [09:51<1:50:46, 14.97s/it]                                                  {'loss': '0.269', 'grad_norm': '6.281', 'learning_rate': '1.404e-05', 'ppl': '1.309', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.13', 'tokens/total': 1088000, 'tokens/trainable': 21147, 'epoch': '0.1417'}
  7%|▋         | 34/478 [09:51<1:50:46, 14.97s/it]  7%|▋         | 35/478 [10:06<1:50:32, 14.97s/it]                                                  {'loss': '0.2339', 'grad_norm': '7', 'learning_rate': '1.447e-05', 'ppl': '1.264', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '23.01', 'tokens/total': 1120000, 'tokens/trainable': 21835, 'epoch': '0.1458'}
  7%|▋         | 35/478 [10:06<1:50:32, 14.97s/it]  8%|▊         | 36/478 [10:21<1:50:17, 14.97s/it]                                                  {'loss': '0.1953', 'grad_norm': '5.5', 'learning_rate': '1.489e-05', 'ppl': '1.216', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.4', 'tokens/total': 1152000, 'tokens/trainable': 22505, 'epoch': '0.15'}
  8%|▊         | 36/478 [10:21<1:50:17, 14.97s/it]  8%|▊         | 37/478 [10:36<1:50:02, 14.97s/it]                                                  {'loss': '0.1743', 'grad_norm': '5.344', 'learning_rate': '1.532e-05', 'ppl': '1.19', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '19.26', 'tokens/total': 1184000, 'tokens/trainable': 23081, 'epoch': '0.1542'}
  8%|▊         | 37/478 [10:36<1:50:02, 14.97s/it]  8%|▊         | 38/478 [10:51<1:49:51, 14.98s/it]                                                  {'loss': '0.2637', 'grad_norm': '6.031', 'learning_rate': '1.574e-05', 'ppl': '1.302', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.96', 'tokens/total': 1216000, 'tokens/trainable': 23769, 'epoch': '0.1583'}
  8%|▊         | 38/478 [10:51<1:49:51, 14.98s/it]  8%|▊         | 39/478 [11:06<1:49:32, 14.97s/it]                                                  {'loss': '0.1641', 'grad_norm': '4.688', 'learning_rate': '1.617e-05', 'ppl': '1.178', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '24.92', 'tokens/total': 1248000, 'tokens/trainable': 24513, 'epoch': '0.1625'}
  8%|▊         | 39/478 [11:06<1:49:32, 14.97s/it]  8%|▊         | 40/478 [11:21<1:49:17, 14.97s/it]                                                  {'loss': '0.1709', 'grad_norm': '4.938', 'learning_rate': '1.66e-05', 'ppl': '1.186', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.32', 'tokens/total': 1280000, 'tokens/trainable': 25121, 'epoch': '0.1667'}
  8%|▊         | 40/478 [11:21<1:49:17, 14.97s/it]  9%|▊         | 41/478 [11:36<1:49:02, 14.97s/it]                                                  {'loss': '0.1924', 'grad_norm': '5.562', 'learning_rate': '1.702e-05', 'ppl': '1.212', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.4', 'tokens/total': 1312000, 'tokens/trainable': 25761, 'epoch': '0.1708'}
  9%|▊         | 41/478 [11:36<1:49:02, 14.97s/it]  9%|▉         | 42/478 [11:51<1:48:46, 14.97s/it]                                                  {'loss': '0.1841', 'grad_norm': '4.969', 'learning_rate': '1.745e-05', 'ppl': '1.202', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '24.32', 'tokens/total': 1344000, 'tokens/trainable': 26488, 'epoch': '0.175'}
  9%|▉         | 42/478 [11:51<1:48:46, 14.97s/it]  9%|▉         | 43/478 [12:06<1:48:28, 14.96s/it]                                                  {'loss': '0.2368', 'grad_norm': '5.562', 'learning_rate': '1.787e-05', 'ppl': '1.267', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.31', 'tokens/total': 1376000, 'tokens/trainable': 27154, 'epoch': '0.1792'}
  9%|▉         | 43/478 [12:06<1:48:28, 14.96s/it]  9%|▉         | 44/478 [12:21<1:48:14, 14.96s/it]                                                  {'loss': '0.2148', 'grad_norm': '5.625', 'learning_rate': '1.83e-05', 'ppl': '1.24', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.67', 'tokens/total': 1408000, 'tokens/trainable': 27802, 'epoch': '0.1833'}
  9%|▉         | 44/478 [12:21<1:48:14, 14.96s/it]  9%|▉         | 45/478 [12:35<1:48:00, 14.97s/it]                                                  {'loss': '0.228', 'grad_norm': '5.906', 'learning_rate': '1.872e-05', 'ppl': '1.256', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.5', 'tokens/total': 1440000, 'tokens/trainable': 28475, 'epoch': '0.1875'}
  9%|▉         | 45/478 [12:36<1:48:00, 14.97s/it] 10%|▉         | 46/478 [12:50<1:47:45, 14.97s/it]                                                  {'loss': '0.1973', 'grad_norm': '5.75', 'learning_rate': '1.915e-05', 'ppl': '1.218', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.88', 'tokens/total': 1472000, 'tokens/trainable': 29099, 'epoch': '0.1917'}
 10%|▉         | 46/478 [12:50<1:47:45, 14.97s/it] 10%|▉         | 47/478 [13:05<1:47:29, 14.97s/it]                                                  {'loss': '0.1963', 'grad_norm': '5.312', 'learning_rate': '1.957e-05', 'ppl': '1.217', 'memory/max_active (GiB)': '45.17', 'memory/max_allocated (GiB)': '45.17', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.48', 'tokens/total': 1504000, 'tokens/trainable': 29771, 'epoch': '0.1958'}
 10%|▉         | 47/478 [13:05<1:47:29, 14.97s/it] 10%|█         | 48/478 [13:20<1:47:15, 14.97s/it]                                                  {'loss': '0.2627', 'grad_norm': '7.156', 'learning_rate': '2e-05', 'ppl': '1.3', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '18.63', 'tokens/total': 1536000, 'tokens/trainable': 30328, 'epoch': '0.2'}
 10%|█         | 48/478 [13:20<1:47:15, 14.97s/it] 10%|█         | 49/478 [13:35<1:47:00, 14.97s/it]                                                  {'loss': '0.2505', 'grad_norm': '7.438', 'learning_rate': '2e-05', 'ppl': '1.285', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.57', 'tokens/total': 1568000, 'tokens/trainable': 30943, 'epoch': '0.2042'}
 10%|█         | 49/478 [13:35<1:47:00, 14.97s/it] 10%|█         | 50/478 [13:50<1:46:44, 14.96s/it]                                                  {'loss': '0.2026', 'grad_norm': '6.75', 'learning_rate': '2e-05', 'ppl': '1.225', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '18.94', 'tokens/total': 1600000, 'tokens/trainable': 31509, 'epoch': '0.2083'}
 10%|█         | 50/478 [13:50<1:46:44, 14.96s/it] 11%|█         | 51/478 [14:05<1:46:29, 14.96s/it]                                                  {'loss': '0.2368', 'grad_norm': '6.281', 'learning_rate': '2e-05', 'ppl': '1.267', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '18.24', 'tokens/total': 1632000, 'tokens/trainable': 32054, 'epoch': '0.2125'}
 11%|█         | 51/478 [14:05<1:46:29, 14.96s/it] 11%|█         | 52/478 [14:20<1:46:06, 14.94s/it]                                                  {'loss': '0.2344', 'grad_norm': '6.281', 'learning_rate': '2e-05', 'ppl': '1.264', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '19.65', 'tokens/total': 1664000, 'tokens/trainable': 32639, 'epoch': '0.2167'}
 11%|█         | 52/478 [14:20<1:46:06, 14.94s/it] 11%|█         | 53/478 [14:35<1:45:54, 14.95s/it]                                                  {'loss': '0.2012', 'grad_norm': '5.188', 'learning_rate': '1.999e-05', 'ppl': '1.223', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.51', 'tokens/total': 1696000, 'tokens/trainable': 33282, 'epoch': '0.2208'}
 11%|█         | 53/478 [14:35<1:45:54, 14.95s/it] 11%|█▏        | 54/478 [14:50<1:45:37, 14.95s/it]                                                  {'loss': '0.2617', 'grad_norm': '5.656', 'learning_rate': '1.999e-05', 'ppl': '1.299', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.75', 'tokens/total': 1728000, 'tokens/trainable': 33931, 'epoch': '0.225'}
 11%|█▏        | 54/478 [14:50<1:45:37, 14.95s/it] 12%|█▏        | 55/478 [15:05<1:45:22, 14.95s/it]                                                  {'loss': '0.207', 'grad_norm': '5.312', 'learning_rate': '1.999e-05', 'ppl': '1.23', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '23.32', 'tokens/total': 1760000, 'tokens/trainable': 34627, 'epoch': '0.2292'}
 12%|█▏        | 55/478 [15:05<1:45:22, 14.95s/it] 12%|█▏        | 56/478 [15:20<1:45:09, 14.95s/it]                                                  {'loss': '0.1846', 'grad_norm': '5.812', 'learning_rate': '1.998e-05', 'ppl': '1.203', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '18.67', 'tokens/total': 1792000, 'tokens/trainable': 35185, 'epoch': '0.2333'}
 12%|█▏        | 56/478 [15:20<1:45:09, 14.95s/it] 12%|█▏        | 57/478 [15:35<1:44:53, 14.95s/it]                                                  {'loss': '0.207', 'grad_norm': '5.594', 'learning_rate': '1.998e-05', 'ppl': '1.23', 'memory/max_active (GiB)': '45.17', 'memory/max_allocated (GiB)': '45.17', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.99', 'tokens/total': 1824000, 'tokens/trainable': 35871, 'epoch': '0.2375'}
 12%|█▏        | 57/478 [15:35<1:44:53, 14.95s/it] 12%|█▏        | 58/478 [15:50<1:44:40, 14.95s/it]                                                  {'loss': '0.2261', 'grad_norm': '6', 'learning_rate': '1.997e-05', 'ppl': '1.254', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '18.53', 'tokens/total': 1856000, 'tokens/trainable': 36425, 'epoch': '0.2417'}
 12%|█▏        | 58/478 [15:50<1:44:40, 14.95s/it] 12%|█▏        | 59/478 [16:05<1:44:27, 14.96s/it]                                                  {'loss': '0.1909', 'grad_norm': '6.75', 'learning_rate': '1.997e-05', 'ppl': '1.21', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.18', 'tokens/total': 1888000, 'tokens/trainable': 37028, 'epoch': '0.2458'}
 12%|█▏        | 59/478 [16:05<1:44:27, 14.96s/it] 13%|█▎        | 60/478 [16:20<1:44:13, 14.96s/it]                                                  {'loss': '0.2793', 'grad_norm': '7.062', 'learning_rate': '1.996e-05', 'ppl': '1.322', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.88', 'tokens/total': 1920000, 'tokens/trainable': 37712, 'epoch': '0.25'}
 13%|█▎        | 60/478 [16:20<1:44:13, 14.96s/it] 13%|█▎        | 61/478 [16:35<1:43:51, 14.94s/it]                                                  {'loss': '0.2065', 'grad_norm': '5.125', 'learning_rate': '1.996e-05', 'ppl': '1.229', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '23.98', 'tokens/total': 1952000, 'tokens/trainable': 38426, 'epoch': '0.2542'}
 13%|█▎        | 61/478 [16:35<1:43:51, 14.94s/it] 13%|█▎        | 62/478 [16:50<1:43:40, 14.95s/it]                                                  {'loss': '0.1821', 'grad_norm': '5.156', 'learning_rate': '1.995e-05', 'ppl': '1.2', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.67', 'tokens/total': 1984000, 'tokens/trainable': 39044, 'epoch': '0.2583'}
 13%|█▎        | 62/478 [16:50<1:43:40, 14.95s/it] 13%|█▎        | 63/478 [17:05<1:43:26, 14.96s/it]                                                  {'loss': '0.2495', 'grad_norm': '6.406', 'learning_rate': '1.994e-05', 'ppl': '1.283', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.31', 'tokens/total': 2016000, 'tokens/trainable': 39651, 'epoch': '0.2625'}
 13%|█▎        | 63/478 [17:05<1:43:26, 14.96s/it] 13%|█▎        | 64/478 [17:20<1:43:12, 14.96s/it]                                                  {'loss': '0.2432', 'grad_norm': '5.375', 'learning_rate': '1.993e-05', 'ppl': '1.275', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.92', 'tokens/total': 2048000, 'tokens/trainable': 40336, 'epoch': '0.2667'}
 13%|█▎        | 64/478 [17:20<1:43:12, 14.96s/it] 14%|█▎        | 65/478 [17:35<1:42:58, 14.96s/it]                                                  {'loss': '0.1392', 'grad_norm': '4.469', 'learning_rate': '1.992e-05', 'ppl': '1.149', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '18.03', 'tokens/total': 2080000, 'tokens/trainable': 40875, 'epoch': '0.2708'}
 14%|█▎        | 65/478 [17:35<1:42:58, 14.96s/it] 14%|█▍        | 66/478 [17:50<1:42:45, 14.96s/it]                                                  {'loss': '0.2319', 'grad_norm': '5.562', 'learning_rate': '1.991e-05', 'ppl': '1.261', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.24', 'tokens/total': 2112000, 'tokens/trainable': 41510, 'epoch': '0.275'}
 14%|█▍        | 66/478 [17:50<1:42:45, 14.96s/it] 14%|█▍        | 67/478 [18:05<1:42:31, 14.97s/it]                                                  {'loss': '0.189', 'grad_norm': '4.938', 'learning_rate': '1.99e-05', 'ppl': '1.208', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.07', 'tokens/total': 2144000, 'tokens/trainable': 42170, 'epoch': '0.2792'}
 14%|█▍        | 67/478 [18:05<1:42:31, 14.97s/it] 14%|█▍        | 68/478 [18:20<1:42:16, 14.97s/it]                                                  {'loss': '0.2231', 'grad_norm': '7.812', 'learning_rate': '1.989e-05', 'ppl': '1.25', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '16.62', 'tokens/total': 2176000, 'tokens/trainable': 42667, 'epoch': '0.2833'}
 14%|█▍        | 68/478 [18:20<1:42:16, 14.97s/it] 14%|█▍        | 69/478 [18:34<1:42:01, 14.97s/it]                                                  {'loss': '0.1948', 'grad_norm': '5.219', 'learning_rate': '1.988e-05', 'ppl': '1.215', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '17.96', 'tokens/total': 2208000, 'tokens/trainable': 43204, 'epoch': '0.2875'}
 14%|█▍        | 69/478 [18:34<1:42:01, 14.97s/it] 15%|█▍        | 70/478 [18:49<1:41:46, 14.97s/it]                                                  {'loss': '0.146', 'grad_norm': '4.156', 'learning_rate': '1.987e-05', 'ppl': '1.157', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.14', 'tokens/total': 2240000, 'tokens/trainable': 43866, 'epoch': '0.2917'}
 15%|█▍        | 70/478 [18:49<1:41:46, 14.97s/it] 15%|█▍        | 71/478 [19:04<1:41:31, 14.97s/it]                                                  {'loss': '0.2588', 'grad_norm': '5.969', 'learning_rate': '1.986e-05', 'ppl': '1.295', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.85', 'tokens/total': 2272000, 'tokens/trainable': 44549, 'epoch': '0.2958'}
 15%|█▍        | 71/478 [19:04<1:41:31, 14.97s/it] 15%|█▌        | 72/478 [19:19<1:41:16, 14.97s/it]                                                  {'loss': '0.1719', 'grad_norm': '4.938', 'learning_rate': '1.985e-05', 'ppl': '1.188', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.27', 'tokens/total': 2304000, 'tokens/trainable': 45155, 'epoch': '0.3'}
 15%|█▌        | 72/478 [19:19<1:41:16, 14.97s/it] 15%|█▌        | 73/478 [19:34<1:41:02, 14.97s/it]                                                  {'loss': '0.1885', 'grad_norm': '5.781', 'learning_rate': '1.983e-05', 'ppl': '1.207', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '19.49', 'tokens/total': 2336000, 'tokens/trainable': 45738, 'epoch': '0.3042'}
 15%|█▌        | 73/478 [19:34<1:41:02, 14.97s/it] 15%|█▌        | 74/478 [19:49<1:40:43, 14.96s/it]                                                  {'loss': '0.2168', 'grad_norm': '6.219', 'learning_rate': '1.982e-05', 'ppl': '1.242', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.08', 'tokens/total': 2368000, 'tokens/trainable': 46367, 'epoch': '0.3083'}
 15%|█▌        | 74/478 [19:49<1:40:43, 14.96s/it] 16%|█▌        | 75/478 [20:04<1:40:29, 14.96s/it]                                                  {'loss': '0.1719', 'grad_norm': '5.375', 'learning_rate': '1.981e-05', 'ppl': '1.188', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '19.66', 'tokens/total': 2400000, 'tokens/trainable': 46955, 'epoch': '0.3125'}
 16%|█▌        | 75/478 [20:04<1:40:29, 14.96s/it] 16%|█▌        | 76/478 [20:19<1:40:16, 14.97s/it]                                                  {'loss': '0.2334', 'grad_norm': '6.188', 'learning_rate': '1.979e-05', 'ppl': '1.263', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.3', 'tokens/total': 2432000, 'tokens/trainable': 47562, 'epoch': '0.3167'}
 16%|█▌        | 76/478 [20:19<1:40:16, 14.97s/it] 16%|█▌        | 77/478 [20:34<1:40:01, 14.97s/it]                                                  {'loss': '0.3389', 'grad_norm': '7.906', 'learning_rate': '1.978e-05', 'ppl': '1.403', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '19', 'tokens/total': 2464000, 'tokens/trainable': 48130, 'epoch': '0.3208'}
 16%|█▌        | 77/478 [20:34<1:40:01, 14.97s/it] 16%|█▋        | 78/478 [20:49<1:39:50, 14.98s/it]                                                  {'loss': '0.1982', 'grad_norm': '4.531', 'learning_rate': '1.976e-05', 'ppl': '1.219', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.93', 'tokens/total': 2496000, 'tokens/trainable': 48787, 'epoch': '0.325'}
 16%|█▋        | 78/478 [20:49<1:39:50, 14.98s/it] 17%|█▋        | 79/478 [21:04<1:39:34, 14.97s/it]                                                  {'loss': '0.2163', 'grad_norm': '5.594', 'learning_rate': '1.975e-05', 'ppl': '1.241', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21', 'tokens/total': 2528000, 'tokens/trainable': 49415, 'epoch': '0.3292'}
 17%|█▋        | 79/478 [21:04<1:39:34, 14.97s/it] 17%|█▋        | 80/478 [21:19<1:39:15, 14.96s/it]                                                  {'loss': '0.1899', 'grad_norm': '4.906', 'learning_rate': '1.973e-05', 'ppl': '1.209', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.99', 'tokens/total': 2560000, 'tokens/trainable': 50071, 'epoch': '0.3333'}
 17%|█▋        | 80/478 [21:19<1:39:15, 14.96s/it] 17%|█▋        | 81/478 [21:34<1:38:58, 14.96s/it]                                                  {'loss': '0.2627', 'grad_norm': '6.5', 'learning_rate': '1.971e-05', 'ppl': '1.3', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.61', 'tokens/total': 2592000, 'tokens/trainable': 50746, 'epoch': '0.3375'}
 17%|█▋        | 81/478 [21:34<1:38:58, 14.96s/it] 17%|█▋        | 82/478 [21:49<1:38:46, 14.96s/it]                                                  {'loss': '0.1797', 'grad_norm': '5.031', 'learning_rate': '1.969e-05', 'ppl': '1.197', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.52', 'tokens/total': 2624000, 'tokens/trainable': 51360, 'epoch': '0.3417'}
 17%|█▋        | 82/478 [21:49<1:38:46, 14.96s/it] 17%|█▋        | 83/478 [22:04<1:38:30, 14.96s/it]                                                  {'loss': '0.1689', 'grad_norm': '7.531', 'learning_rate': '1.968e-05', 'ppl': '1.184', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '16.22', 'tokens/total': 2656000, 'tokens/trainable': 51845, 'epoch': '0.3458'}
 17%|█▋        | 83/478 [22:04<1:38:30, 14.96s/it] 18%|█▊        | 84/478 [22:19<1:38:15, 14.96s/it]                                                  {'loss': '0.1943', 'grad_norm': '5.688', 'learning_rate': '1.966e-05', 'ppl': '1.214', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.68', 'tokens/total': 2688000, 'tokens/trainable': 52523, 'epoch': '0.35'}
 18%|█▊        | 84/478 [22:19<1:38:15, 14.96s/it] 18%|█▊        | 85/478 [22:34<1:38:00, 14.96s/it]                                                  {'loss': '0.1772', 'grad_norm': '5.344', 'learning_rate': '1.964e-05', 'ppl': '1.194', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '19.51', 'tokens/total': 2720000, 'tokens/trainable': 53106, 'epoch': '0.3542'}
 18%|█▊        | 85/478 [22:34<1:38:00, 14.96s/it] 18%|█▊        | 86/478 [22:49<1:37:47, 14.97s/it]                                                  {'loss': '0.2812', 'grad_norm': '7.031', 'learning_rate': '1.962e-05', 'ppl': '1.325', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '17.42', 'tokens/total': 2752000, 'tokens/trainable': 53627, 'epoch': '0.3583'}
 18%|█▊        | 86/478 [22:49<1:37:47, 14.97s/it] 18%|█▊        | 87/478 [23:04<1:37:32, 14.97s/it]                                                  {'loss': '0.1904', 'grad_norm': '5.344', 'learning_rate': '1.96e-05', 'ppl': '1.21', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.8', 'tokens/total': 2784000, 'tokens/trainable': 54249, 'epoch': '0.3625'}
 18%|█▊        | 87/478 [23:04<1:37:32, 14.97s/it] 18%|█▊        | 88/478 [23:19<1:37:16, 14.97s/it]                                                  {'loss': '0.2026', 'grad_norm': '5.344', 'learning_rate': '1.958e-05', 'ppl': '1.225', 'memory/max_active (GiB)': '45.17', 'memory/max_allocated (GiB)': '45.17', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '23.22', 'tokens/total': 2816000, 'tokens/trainable': 54943, 'epoch': '0.3667'}
 18%|█▊        | 88/478 [23:19<1:37:16, 14.97s/it] 19%|█▊        | 89/478 [23:34<1:37:02, 14.97s/it]                                                  {'loss': '0.23', 'grad_norm': '7.156', 'learning_rate': '1.956e-05', 'ppl': '1.259', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.03', 'tokens/total': 2848000, 'tokens/trainable': 55572, 'epoch': '0.3708'}
 19%|█▊        | 89/478 [23:34<1:37:02, 14.97s/it] 19%|█▉        | 90/478 [23:49<1:36:47, 14.97s/it]                                                  {'loss': '0.2236', 'grad_norm': '5.625', 'learning_rate': '1.954e-05', 'ppl': '1.251', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '23.91', 'tokens/total': 2880000, 'tokens/trainable': 56287, 'epoch': '0.375'}
 19%|█▉        | 90/478 [23:49<1:36:47, 14.97s/it] 19%|█▉        | 91/478 [24:04<1:36:32, 14.97s/it]                                                  {'loss': '0.2427', 'grad_norm': '6.5', 'learning_rate': '1.951e-05', 'ppl': '1.275', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.5', 'tokens/total': 2912000, 'tokens/trainable': 56900, 'epoch': '0.3792'}
 19%|█▉        | 91/478 [24:04<1:36:32, 14.97s/it] 19%|█▉        | 92/478 [24:19<1:36:17, 14.97s/it]                                                  {'loss': '0.2754', 'grad_norm': '6.562', 'learning_rate': '1.949e-05', 'ppl': '1.317', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '18.6', 'tokens/total': 2944000, 'tokens/trainable': 57456, 'epoch': '0.3833'}
 19%|█▉        | 92/478 [24:19<1:36:17, 14.97s/it] 19%|█▉        | 93/478 [24:34<1:36:02, 14.97s/it]                                                  {'loss': '0.2202', 'grad_norm': '5.438', 'learning_rate': '1.947e-05', 'ppl': '1.246', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.98', 'tokens/total': 2976000, 'tokens/trainable': 58113, 'epoch': '0.3875'}
 19%|█▉        | 93/478 [24:34<1:36:02, 14.97s/it] 20%|█▉        | 94/478 [24:49<1:35:46, 14.97s/it]                                                  {'loss': '0.1865', 'grad_norm': '5.844', 'learning_rate': '1.944e-05', 'ppl': '1.205', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '17.47', 'tokens/total': 3008000, 'tokens/trainable': 58635, 'epoch': '0.3917'}
 20%|█▉        | 94/478 [24:49<1:35:46, 14.97s/it] 20%|█▉        | 95/478 [25:04<1:35:33, 14.97s/it]                                                  {'loss': '0.2656', 'grad_norm': '6.562', 'learning_rate': '1.942e-05', 'ppl': '1.304', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '16.09', 'tokens/total': 3040000, 'tokens/trainable': 59116, 'epoch': '0.3958'}
 20%|█▉        | 95/478 [25:04<1:35:33, 14.97s/it] 20%|██        | 96/478 [25:19<1:35:18, 14.97s/it]                                                  {'loss': '0.1924', 'grad_norm': '6.875', 'learning_rate': '1.939e-05', 'ppl': '1.212', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '16.69', 'tokens/total': 3072000, 'tokens/trainable': 59615, 'epoch': '0.4'}
 20%|██        | 96/478 [25:19<1:35:18, 14.97s/it] 20%|██        | 97/478 [25:34<1:35:04, 14.97s/it]                                                  {'loss': '0.2271', 'grad_norm': '7.094', 'learning_rate': '1.937e-05', 'ppl': '1.255', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.66', 'tokens/total': 3104000, 'tokens/trainable': 60233, 'epoch': '0.4042'}
 20%|██        | 97/478 [25:34<1:35:04, 14.97s/it] 21%|██        | 98/478 [25:49<1:34:49, 14.97s/it]                                                  {'loss': '0.2163', 'grad_norm': '5.062', 'learning_rate': '1.934e-05', 'ppl': '1.241', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.87', 'tokens/total': 3136000, 'tokens/trainable': 60857, 'epoch': '0.4083'}
 21%|██        | 98/478 [25:49<1:34:49, 14.97s/it] 21%|██        | 99/478 [26:03<1:34:33, 14.97s/it]                                                  {'loss': '0.1621', 'grad_norm': '4.938', 'learning_rate': '1.932e-05', 'ppl': '1.176', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.88', 'tokens/total': 3168000, 'tokens/trainable': 61541, 'epoch': '0.4125'}
 21%|██        | 99/478 [26:04<1:34:33, 14.97s/it] 21%|██        | 100/478 [26:18<1:34:18, 14.97s/it]                                                   {'loss': '0.2051', 'grad_norm': '5.219', 'learning_rate': '1.929e-05', 'ppl': '1.228', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.57', 'tokens/total': 3200000, 'tokens/trainable': 62156, 'epoch': '0.4167'}
 21%|██        | 100/478 [26:18<1:34:18, 14.97s/it] 21%|██        | 101/478 [26:33<1:34:02, 14.97s/it]                                                   {'loss': '0.2344', 'grad_norm': '6.625', 'learning_rate': '1.926e-05', 'ppl': '1.264', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '18.6', 'tokens/total': 3232000, 'tokens/trainable': 62712, 'epoch': '0.4208'}
 21%|██        | 101/478 [26:33<1:34:02, 14.97s/it] 21%|██▏       | 102/478 [26:48<1:33:46, 14.96s/it]                                                   {'loss': '0.1924', 'grad_norm': '5.812', 'learning_rate': '1.924e-05', 'ppl': '1.212', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.25', 'tokens/total': 3264000, 'tokens/trainable': 63347, 'epoch': '0.425'}
 21%|██▏       | 102/478 [26:48<1:33:46, 14.96s/it] 22%|██▏       | 103/478 [27:03<1:33:31, 14.96s/it]                                                   {'loss': '0.1865', 'grad_norm': '5', 'learning_rate': '1.921e-05', 'ppl': '1.205', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '19.4', 'tokens/total': 3296000, 'tokens/trainable': 63927, 'epoch': '0.4292'}
 22%|██▏       | 103/478 [27:03<1:33:31, 14.96s/it] 22%|██▏       | 104/478 [27:18<1:33:17, 14.97s/it]                                                   {'loss': '0.2471', 'grad_norm': '5.438', 'learning_rate': '1.918e-05', 'ppl': '1.28', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.87', 'tokens/total': 3328000, 'tokens/trainable': 64551, 'epoch': '0.4333'}
 22%|██▏       | 104/478 [27:18<1:33:17, 14.97s/it] 22%|██▏       | 105/478 [27:33<1:33:02, 14.97s/it]                                                   {'loss': '0.1685', 'grad_norm': '5.125', 'learning_rate': '1.915e-05', 'ppl': '1.183', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.44', 'tokens/total': 3360000, 'tokens/trainable': 65192, 'epoch': '0.4375'}
 22%|██▏       | 105/478 [27:33<1:33:02, 14.97s/it] 22%|██▏       | 106/478 [27:48<1:32:47, 14.97s/it]                                                   {'loss': '0.1958', 'grad_norm': '5.219', 'learning_rate': '1.912e-05', 'ppl': '1.216', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '23.29', 'tokens/total': 3392000, 'tokens/trainable': 65888, 'epoch': '0.4417'}
 22%|██▏       | 106/478 [27:48<1:32:47, 14.97s/it] 22%|██▏       | 107/478 [28:03<1:32:32, 14.97s/it]                                                   {'loss': '0.2749', 'grad_norm': '6.094', 'learning_rate': '1.909e-05', 'ppl': '1.316', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.91', 'tokens/total': 3424000, 'tokens/trainable': 66513, 'epoch': '0.4458'}
 22%|██▏       | 107/478 [28:03<1:32:32, 14.97s/it] 23%|██▎       | 108/478 [28:18<1:32:17, 14.97s/it]                                                   {'loss': '0.1978', 'grad_norm': '4.844', 'learning_rate': '1.906e-05', 'ppl': '1.219', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.4', 'tokens/total': 3456000, 'tokens/trainable': 67123, 'epoch': '0.45'}
 23%|██▎       | 108/478 [28:18<1:32:17, 14.97s/it] 23%|██▎       | 109/478 [28:33<1:32:02, 14.97s/it]                                                   {'loss': '0.2812', 'grad_norm': '6.406', 'learning_rate': '1.903e-05', 'ppl': '1.325', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '18.87', 'tokens/total': 3488000, 'tokens/trainable': 67687, 'epoch': '0.4542'}
 23%|██▎       | 109/478 [28:33<1:32:02, 14.97s/it] 23%|██▎       | 110/478 [28:48<1:31:47, 14.97s/it]                                                   {'loss': '0.2578', 'grad_norm': '6.062', 'learning_rate': '1.9e-05', 'ppl': '1.294', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.61', 'tokens/total': 3520000, 'tokens/trainable': 68333, 'epoch': '0.4583'}
 23%|██▎       | 110/478 [28:48<1:31:47, 14.97s/it] 23%|██▎       | 111/478 [29:03<1:31:39, 14.99s/it]                                                   {'loss': '0.1865', 'grad_norm': '4.438', 'learning_rate': '1.896e-05', 'ppl': '1.205', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.88', 'tokens/total': 3552000, 'tokens/trainable': 68990, 'epoch': '0.4625'}
 23%|██▎       | 111/478 [29:03<1:31:39, 14.99s/it] 23%|██▎       | 112/478 [29:18<1:31:22, 14.98s/it]                                                   {'loss': '0.2842', 'grad_norm': '6.375', 'learning_rate': '1.893e-05', 'ppl': '1.329', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '18.87', 'tokens/total': 3584000, 'tokens/trainable': 69554, 'epoch': '0.4667'}
 23%|██▎       | 112/478 [29:18<1:31:22, 14.98s/it] 24%|██▎       | 113/478 [29:33<1:30:58, 14.96s/it]                                                   {'loss': '0.29', 'grad_norm': '7.375', 'learning_rate': '1.89e-05', 'ppl': '1.336', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.7', 'tokens/total': 3616000, 'tokens/trainable': 70200, 'epoch': '0.4708'}
 24%|██▎       | 113/478 [29:33<1:30:58, 14.96s/it] 24%|██▍       | 114/478 [29:48<1:30:44, 14.96s/it]                                                   {'loss': '0.1782', 'grad_norm': '9.062', 'learning_rate': '1.886e-05', 'ppl': '1.195', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.59', 'tokens/total': 3648000, 'tokens/trainable': 70875, 'epoch': '0.475'}
 24%|██▍       | 114/478 [29:48<1:30:44, 14.96s/it] 24%|██▍       | 115/478 [30:03<1:30:30, 14.96s/it]                                                   {'loss': '0.2656', 'grad_norm': '5.594', 'learning_rate': '1.883e-05', 'ppl': '1.304', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.14', 'tokens/total': 3680000, 'tokens/trainable': 71477, 'epoch': '0.4792'}
 24%|██▍       | 115/478 [30:03<1:30:30, 14.96s/it] 24%|██▍       | 116/478 [30:18<1:30:16, 14.96s/it]                                                   {'loss': '0.2251', 'grad_norm': '4.688', 'learning_rate': '1.88e-05', 'ppl': '1.252', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '18.03', 'tokens/total': 3712000, 'tokens/trainable': 72016, 'epoch': '0.4833'}
 24%|██▍       | 116/478 [30:18<1:30:16, 14.96s/it] 24%|██▍       | 117/478 [30:33<1:30:01, 14.96s/it]                                                   {'loss': '0.2319', 'grad_norm': '5.438', 'learning_rate': '1.876e-05', 'ppl': '1.261', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.68', 'tokens/total': 3744000, 'tokens/trainable': 72664, 'epoch': '0.4875'}
 24%|██▍       | 117/478 [30:33<1:30:01, 14.96s/it] 25%|██▍       | 118/478 [30:48<1:29:49, 14.97s/it]                                                   {'loss': '0.1895', 'grad_norm': '5.281', 'learning_rate': '1.873e-05', 'ppl': '1.209', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.71', 'tokens/total': 3776000, 'tokens/trainable': 73314, 'epoch': '0.4917'}
 25%|██▍       | 118/478 [30:48<1:29:49, 14.97s/it] 25%|██▍       | 119/478 [31:03<1:29:33, 14.97s/it]                                                   {'loss': '0.1973', 'grad_norm': '6.719', 'learning_rate': '1.869e-05', 'ppl': '1.218', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.98', 'tokens/total': 3808000, 'tokens/trainable': 74001, 'epoch': '0.4958'}
 25%|██▍       | 119/478 [31:03<1:29:33, 14.97s/it] 25%|██▌       | 120/478 [31:18<1:29:18, 14.97s/it]                                                   {'loss': '0.2339', 'grad_norm': '4.969', 'learning_rate': '1.865e-05', 'ppl': '1.264', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.25', 'tokens/total': 3840000, 'tokens/trainable': 74666, 'epoch': '0.5'}
 25%|██▌       | 120/478 [31:18<1:29:18, 14.97s/it][2026-04-17 02:40:42,441] [INFO] [axolotl.core.trainers.base] Running evaluation step...
[2026-04-17 02:40:50,590] [INFO] [axolotl.utils.samplers.multipack] gather_len_batches: [54, 54]

  0%|          | 0/27 [00:00<?, ?it/s]
  7%|▋         | 2/27 [00:02<00:35,  1.41s/it]
 11%|█         | 3/27 [00:05<00:47,  1.97s/it]
 15%|█▍        | 4/27 [00:08<00:51,  2.26s/it]
 19%|█▊        | 5/27 [00:11<00:53,  2.43s/it]
 22%|██▏       | 6/27 [00:13<00:53,  2.54s/it]
 26%|██▌       | 7/27 [00:16<00:52,  2.61s/it]
 30%|██▉       | 8/27 [00:19<00:50,  2.65s/it]
 33%|███▎      | 9/27 [00:22<00:48,  2.68s/it]
 37%|███▋      | 10/27 [00:24<00:45,  2.71s/it]
 41%|████      | 11/27 [00:27<00:43,  2.72s/it]
 44%|████▍     | 12/27 [00:30<00:39,  2.64s/it]
 48%|████▊     | 13/27 [00:33<00:38,  2.76s/it]
 52%|█████▏    | 14/27 [00:35<00:35,  2.76s/it]
 56%|█████▌    | 15/27 [00:38<00:33,  2.76s/it]
 59%|█████▉    | 16/27 [00:41<00:30,  2.76s/it]
 63%|██████▎   | 17/27 [00:44<00:27,  2.76s/it]
 67%|██████▋   | 18/27 [00:46<00:24,  2.76s/it]
 70%|███████   | 19/27 [00:49<00:22,  2.76s/it]
 74%|███████▍  | 20/27 [00:52<00:19,  2.75s/it]
 78%|███████▊  | 21/27 [00:54<00:16,  2.67s/it]
 81%|████████▏ | 22/27 [00:57<00:13,  2.78s/it]
 85%|████████▌ | 23/27 [01:00<00:11,  2.77s/it]
 89%|████████▉ | 24/27 [01:03<00:08,  2.77s/it]
 93%|█████████▎| 25/27 [01:06<00:05,  2.76s/it]
 96%|█████████▋| 26/27 [01:08<00:02,  2.76s/it]
100%|██████████| 27/27 [01:11<00:00,  2.79s/it]                                                   
                                               {'eval_loss': '0.2288', 'eval_runtime': '75.12', 'eval_samples_per_second': '2.782', 'eval_steps_per_second': '1.398', 'eval_ppl': '1.257', 'memory/max_active (GiB)': '34.91', 'memory/max_allocated (GiB)': '34.91', 'memory/device_reserved (GiB)': '59.04', 'epoch': '0.5', 'tokens/train_per_sec_per_gpu': '0'}
 25%|██▌       | 120/478 [32:41<1:29:18, 14.97s/it]
100%|██████████| 27/27 [01:13<00:00,  2.79s/it]
                                                25%|██▌       | 121/478 [32:56<3:57:42, 39.95s/it]                                                   {'loss': '0.2651', 'grad_norm': '6.125', 'learning_rate': '1.862e-05', 'ppl': '1.304', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '17.61', 'tokens/total': 3872000, 'tokens/trainable': 75192, 'epoch': '0.5042'}
 25%|██▌       | 121/478 [32:56<3:57:42, 39.95s/it] 26%|██▌       | 122/478 [33:11<3:12:32, 32.45s/it]                                                   {'loss': '0.2109', 'grad_norm': '5.812', 'learning_rate': '1.858e-05', 'ppl': '1.235', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.68', 'tokens/total': 3904000, 'tokens/trainable': 75780, 'epoch': '0.5083'}
 26%|██▌       | 122/478 [33:11<3:12:32, 32.45s/it] 26%|██▌       | 123/478 [33:26<2:40:56, 27.20s/it]                                                   {'loss': '0.2168', 'grad_norm': '5.438', 'learning_rate': '1.854e-05', 'ppl': '1.242', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '17.31', 'tokens/total': 3936000, 'tokens/trainable': 76297, 'epoch': '0.5125'}
 26%|██▌       | 123/478 [33:26<2:40:56, 27.20s/it] 26%|██▌       | 124/478 [33:41<2:18:48, 23.53s/it]                                                   {'loss': '0.2153', 'grad_norm': '5.344', 'learning_rate': '1.85e-05', 'ppl': '1.24', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '17.41', 'tokens/total': 3968000, 'tokens/trainable': 76817, 'epoch': '0.5167'}
 26%|██▌       | 124/478 [33:41<2:18:48, 23.53s/it] 26%|██▌       | 125/478 [33:56<2:03:16, 20.95s/it]                                                   {'loss': '0.2075', 'grad_norm': '8.938', 'learning_rate': '1.847e-05', 'ppl': '1.231', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.12', 'tokens/total': 4000000, 'tokens/trainable': 77418, 'epoch': '0.5208'}
 26%|██▌       | 125/478 [33:56<2:03:16, 20.95s/it] 26%|██▋       | 126/478 [34:11<1:52:21, 19.15s/it]                                                   {'loss': '0.2148', 'grad_norm': '5.344', 'learning_rate': '1.843e-05', 'ppl': '1.24', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.9', 'tokens/total': 4032000, 'tokens/trainable': 78072, 'epoch': '0.525'}
 26%|██▋       | 126/478 [34:11<1:52:21, 19.15s/it] 27%|██▋       | 127/478 [34:26<1:44:40, 17.89s/it]                                                   {'loss': '0.2383', 'grad_norm': '5.594', 'learning_rate': '1.839e-05', 'ppl': '1.269', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.32', 'tokens/total': 4064000, 'tokens/trainable': 78709, 'epoch': '0.5292'}
 27%|██▋       | 127/478 [34:26<1:44:40, 17.89s/it] 27%|██▋       | 128/478 [34:41<1:39:13, 17.01s/it]                                                   {'loss': '0.2544', 'grad_norm': '5.969', 'learning_rate': '1.835e-05', 'ppl': '1.29', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.15', 'tokens/total': 4096000, 'tokens/trainable': 79251, 'epoch': '0.5333'}
 27%|██▋       | 128/478 [34:41<1:39:13, 17.01s/it] 27%|██▋       | 129/478 [34:56<1:35:21, 16.39s/it]                                                   {'loss': '0.2046', 'grad_norm': '5.156', 'learning_rate': '1.831e-05', 'ppl': '1.227', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.95', 'tokens/total': 4128000, 'tokens/trainable': 79847, 'epoch': '0.5375'}
 27%|██▋       | 129/478 [34:56<1:35:21, 16.39s/it] 27%|██▋       | 130/478 [35:11<1:32:35, 15.96s/it]                                                   {'loss': '0.2026', 'grad_norm': '4.969', 'learning_rate': '1.827e-05', 'ppl': '1.225', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.98', 'tokens/total': 4160000, 'tokens/trainable': 80444, 'epoch': '0.5417'}
 27%|██▋       | 130/478 [35:11<1:32:35, 15.96s/it] 27%|██▋       | 131/478 [35:26<1:30:33, 15.66s/it]                                                   {'loss': '0.2222', 'grad_norm': '5.688', 'learning_rate': '1.823e-05', 'ppl': '1.249', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '17.91', 'tokens/total': 4192000, 'tokens/trainable': 80979, 'epoch': '0.5458'}
 27%|██▋       | 131/478 [35:26<1:30:33, 15.66s/it] 28%|██▊       | 132/478 [35:41<1:29:04, 15.45s/it]                                                   {'loss': '0.2271', 'grad_norm': '5.656', 'learning_rate': '1.818e-05', 'ppl': '1.255', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.49', 'tokens/total': 4224000, 'tokens/trainable': 81531, 'epoch': '0.55'}
 28%|██▊       | 132/478 [35:41<1:29:04, 15.45s/it] 28%|██▊       | 133/478 [35:55<1:27:55, 15.29s/it]                                                   {'loss': '0.2725', 'grad_norm': '4.938', 'learning_rate': '1.814e-05', 'ppl': '1.313', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '23.24', 'tokens/total': 4256000, 'tokens/trainable': 82224, 'epoch': '0.5542'}
 28%|██▊       | 133/478 [35:55<1:27:55, 15.29s/it] 28%|██▊       | 134/478 [36:10<1:27:05, 15.19s/it]                                                   {'loss': '0.2539', 'grad_norm': '6.031', 'learning_rate': '1.81e-05', 'ppl': '1.289', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '23.04', 'tokens/total': 4288000, 'tokens/trainable': 82912, 'epoch': '0.5583'}
 28%|██▊       | 134/478 [36:10<1:27:05, 15.19s/it] 28%|██▊       | 135/478 [36:25<1:26:25, 15.12s/it]                                                   {'loss': '0.2368', 'grad_norm': '5.312', 'learning_rate': '1.806e-05', 'ppl': '1.267', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.44', 'tokens/total': 4320000, 'tokens/trainable': 83582, 'epoch': '0.5625'}
 28%|██▊       | 135/478 [36:25<1:26:25, 15.12s/it] 28%|██▊       | 136/478 [36:40<1:25:53, 15.07s/it]                                                   {'loss': '0.2617', 'grad_norm': '7.75', 'learning_rate': '1.801e-05', 'ppl': '1.299', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.19', 'tokens/total': 4352000, 'tokens/trainable': 84185, 'epoch': '0.5667'}
 28%|██▊       | 136/478 [36:40<1:25:53, 15.07s/it] 29%|██▊       | 137/478 [36:55<1:25:24, 15.03s/it]                                                   {'loss': '0.2041', 'grad_norm': '4.906', 'learning_rate': '1.797e-05', 'ppl': '1.226', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.79', 'tokens/total': 4384000, 'tokens/trainable': 84805, 'epoch': '0.5708'}
 29%|██▊       | 137/478 [36:55<1:25:24, 15.03s/it] 29%|██▉       | 138/478 [37:10<1:25:01, 15.01s/it]                                                   {'loss': '0.2881', 'grad_norm': '7.125', 'learning_rate': '1.792e-05', 'ppl': '1.334', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.32', 'tokens/total': 4416000, 'tokens/trainable': 85382, 'epoch': '0.575'}
 29%|██▉       | 138/478 [37:10<1:25:01, 15.01s/it] 29%|██▉       | 139/478 [37:25<1:24:41, 14.99s/it]                                                   {'loss': '0.1733', 'grad_norm': '4.5', 'learning_rate': '1.788e-05', 'ppl': '1.189', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.12', 'tokens/total': 4448000, 'tokens/trainable': 85983, 'epoch': '0.5792'}
 29%|██▉       | 139/478 [37:25<1:24:41, 14.99s/it] 29%|██▉       | 140/478 [37:40<1:24:22, 14.98s/it]                                                   {'loss': '0.2285', 'grad_norm': '5.844', 'learning_rate': '1.783e-05', 'ppl': '1.257', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '16.68', 'tokens/total': 4480000, 'tokens/trainable': 86481, 'epoch': '0.5833'}
 29%|██▉       | 140/478 [37:40<1:24:22, 14.98s/it] 29%|██▉       | 141/478 [37:55<1:23:58, 14.95s/it]                                                   {'loss': '0.2144', 'grad_norm': '5.438', 'learning_rate': '1.779e-05', 'ppl': '1.239', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.69', 'tokens/total': 4512000, 'tokens/trainable': 87126, 'epoch': '0.5875'}
 29%|██▉       | 141/478 [37:55<1:23:58, 14.95s/it] 30%|██▉       | 142/478 [38:10<1:23:42, 14.95s/it]                                                   {'loss': '0.2754', 'grad_norm': '6.188', 'learning_rate': '1.774e-05', 'ppl': '1.317', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.84', 'tokens/total': 4544000, 'tokens/trainable': 87778, 'epoch': '0.5917'}
 30%|██▉       | 142/478 [38:10<1:23:42, 14.95s/it] 30%|██▉       | 143/478 [38:25<1:23:28, 14.95s/it]                                                   {'loss': '0.2407', 'grad_norm': '5.375', 'learning_rate': '1.77e-05', 'ppl': '1.272', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.08', 'tokens/total': 4576000, 'tokens/trainable': 88408, 'epoch': '0.5958'}
 30%|██▉       | 143/478 [38:25<1:23:28, 14.95s/it] 30%|███       | 144/478 [38:40<1:23:14, 14.95s/it]                                                   {'loss': '0.176', 'grad_norm': '5.844', 'learning_rate': '1.765e-05', 'ppl': '1.192', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.75', 'tokens/total': 4608000, 'tokens/trainable': 88968, 'epoch': '0.6'}
 30%|███       | 144/478 [38:40<1:23:14, 14.95s/it] 30%|███       | 145/478 [38:55<1:22:58, 14.95s/it]                                                   {'loss': '0.1982', 'grad_norm': '4.781', 'learning_rate': '1.76e-05', 'ppl': '1.219', 'memory/max_active (GiB)': '45.17', 'memory/max_allocated (GiB)': '45.17', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.33', 'tokens/total': 4640000, 'tokens/trainable': 89545, 'epoch': '0.6042'}
 30%|███       | 145/478 [38:55<1:22:58, 14.95s/it] 31%|███       | 146/478 [39:10<1:22:41, 14.94s/it]                                                   {'loss': '0.1758', 'grad_norm': '5.562', 'learning_rate': '1.756e-05', 'ppl': '1.192', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.85', 'tokens/total': 4672000, 'tokens/trainable': 90107, 'epoch': '0.6083'}
 31%|███       | 146/478 [39:10<1:22:41, 14.94s/it] 31%|███       | 147/478 [39:25<1:22:27, 14.95s/it]                                                   {'loss': '0.2261', 'grad_norm': '5.438', 'learning_rate': '1.751e-05', 'ppl': '1.254', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.66', 'tokens/total': 4704000, 'tokens/trainable': 90724, 'epoch': '0.6125'}
 31%|███       | 147/478 [39:25<1:22:27, 14.95s/it] 31%|███       | 148/478 [39:40<1:22:12, 14.95s/it]                                                   {'loss': '0.2046', 'grad_norm': '5.438', 'learning_rate': '1.746e-05', 'ppl': '1.227', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.3', 'tokens/total': 4736000, 'tokens/trainable': 91390, 'epoch': '0.6167'}
 31%|███       | 148/478 [39:40<1:22:12, 14.95s/it] 31%|███       | 149/478 [39:55<1:21:58, 14.95s/it]                                                   {'loss': '0.2109', 'grad_norm': '5.375', 'learning_rate': '1.741e-05', 'ppl': '1.235', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.79', 'tokens/total': 4768000, 'tokens/trainable': 91981, 'epoch': '0.6208'}
 31%|███       | 149/478 [39:55<1:21:58, 14.95s/it] 31%|███▏      | 150/478 [40:10<1:21:43, 14.95s/it]                                                   {'loss': '0.208', 'grad_norm': '5.812', 'learning_rate': '1.736e-05', 'ppl': '1.231', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.79', 'tokens/total': 4800000, 'tokens/trainable': 92542, 'epoch': '0.625'}
 31%|███▏      | 150/478 [40:10<1:21:43, 14.95s/it] 32%|███▏      | 151/478 [40:24<1:21:28, 14.95s/it]                                                   {'loss': '0.2383', 'grad_norm': '5.625', 'learning_rate': '1.731e-05', 'ppl': '1.269', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.89', 'tokens/total': 4832000, 'tokens/trainable': 93106, 'epoch': '0.6292'}
 32%|███▏      | 151/478 [40:24<1:21:28, 14.95s/it] 32%|███▏      | 152/478 [40:39<1:21:13, 14.95s/it]                                                   {'loss': '0.2788', 'grad_norm': '7.312', 'learning_rate': '1.726e-05', 'ppl': '1.322', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.64', 'tokens/total': 4864000, 'tokens/trainable': 93782, 'epoch': '0.6333'}
 32%|███▏      | 152/478 [40:39<1:21:13, 14.95s/it] 32%|███▏      | 153/478 [40:54<1:21:02, 14.96s/it]                                                   {'loss': '0.2266', 'grad_norm': '5.344', 'learning_rate': '1.721e-05', 'ppl': '1.254', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.77', 'tokens/total': 4896000, 'tokens/trainable': 94464, 'epoch': '0.6375'}
 32%|███▏      | 153/478 [40:54<1:21:02, 14.96s/it] 32%|███▏      | 154/478 [41:09<1:20:47, 14.96s/it]                                                   {'loss': '0.1919', 'grad_norm': '4.656', 'learning_rate': '1.716e-05', 'ppl': '1.212', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.29', 'tokens/total': 4928000, 'tokens/trainable': 95070, 'epoch': '0.6417'}
 32%|███▏      | 154/478 [41:09<1:20:47, 14.96s/it] 32%|███▏      | 155/478 [41:24<1:20:32, 14.96s/it]                                                   {'loss': '0.2554', 'grad_norm': '5.844', 'learning_rate': '1.711e-05', 'ppl': '1.291', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.57', 'tokens/total': 4960000, 'tokens/trainable': 95655, 'epoch': '0.6458'}
 32%|███▏      | 155/478 [41:24<1:20:32, 14.96s/it] 33%|███▎      | 156/478 [41:39<1:20:14, 14.95s/it]                                                   {'loss': '0.2178', 'grad_norm': '4.375', 'learning_rate': '1.706e-05', 'ppl': '1.243', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '23.26', 'tokens/total': 4992000, 'tokens/trainable': 96349, 'epoch': '0.65'}
 33%|███▎      | 156/478 [41:39<1:20:14, 14.95s/it] 33%|███▎      | 157/478 [41:54<1:19:59, 14.95s/it]                                                   {'loss': '0.2354', 'grad_norm': '4.625', 'learning_rate': '1.701e-05', 'ppl': '1.265', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.3', 'tokens/total': 5024000, 'tokens/trainable': 97015, 'epoch': '0.6542'}
 33%|███▎      | 157/478 [41:54<1:19:59, 14.95s/it] 33%|███▎      | 158/478 [42:09<1:19:44, 14.95s/it]                                                   {'loss': '0.2778', 'grad_norm': '5.625', 'learning_rate': '1.695e-05', 'ppl': '1.32', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.6', 'tokens/total': 5056000, 'tokens/trainable': 97660, 'epoch': '0.6583'}
 33%|███▎      | 158/478 [42:09<1:19:44, 14.95s/it] 33%|███▎      | 159/478 [42:24<1:19:29, 14.95s/it]                                                   {'loss': '0.1919', 'grad_norm': '6.25', 'learning_rate': '1.69e-05', 'ppl': '1.212', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.29', 'tokens/total': 5088000, 'tokens/trainable': 98206, 'epoch': '0.6625'}
 33%|███▎      | 159/478 [42:24<1:19:29, 14.95s/it] 33%|███▎      | 160/478 [42:39<1:19:15, 14.95s/it]                                                   {'loss': '0.209', 'grad_norm': '4.875', 'learning_rate': '1.685e-05', 'ppl': '1.232', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '23.23', 'tokens/total': 5120000, 'tokens/trainable': 98900, 'epoch': '0.6667'}
 33%|███▎      | 160/478 [42:39<1:19:15, 14.95s/it] 34%|███▎      | 161/478 [42:54<1:18:59, 14.95s/it]                                                   {'loss': '0.2231', 'grad_norm': '5.594', 'learning_rate': '1.68e-05', 'ppl': '1.25', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '17.42', 'tokens/total': 5152000, 'tokens/trainable': 99420, 'epoch': '0.6708'}
 34%|███▎      | 161/478 [42:54<1:18:59, 14.95s/it] 34%|███▍      | 162/478 [43:09<1:18:44, 14.95s/it]                                                   {'loss': '0.2354', 'grad_norm': '4.938', 'learning_rate': '1.674e-05', 'ppl': '1.265', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.09', 'tokens/total': 5184000, 'tokens/trainable': 100050, 'epoch': '0.675'}
 34%|███▍      | 162/478 [43:09<1:18:44, 14.95s/it] 34%|███▍      | 163/478 [43:24<1:18:29, 14.95s/it]                                                   {'loss': '0.2231', 'grad_norm': '4.656', 'learning_rate': '1.669e-05', 'ppl': '1.25', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.64', 'tokens/total': 5216000, 'tokens/trainable': 100696, 'epoch': '0.6792'}
 34%|███▍      | 163/478 [43:24<1:18:29, 14.95s/it] 34%|███▍      | 164/478 [43:39<1:18:14, 14.95s/it]                                                   {'loss': '0.2285', 'grad_norm': '6', 'learning_rate': '1.663e-05', 'ppl': '1.257', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '16.67', 'tokens/total': 5248000, 'tokens/trainable': 101194, 'epoch': '0.6833'}
 34%|███▍      | 164/478 [43:39<1:18:14, 14.95s/it] 35%|███▍      | 165/478 [43:54<1:18:00, 14.95s/it]                                                   {'loss': '0.2773', 'grad_norm': '5.438', 'learning_rate': '1.658e-05', 'ppl': '1.32', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.59', 'tokens/total': 5280000, 'tokens/trainable': 101869, 'epoch': '0.6875'}
 35%|███▍      | 165/478 [43:54<1:18:00, 14.95s/it] 35%|███▍      | 166/478 [44:09<1:17:45, 14.95s/it]                                                   {'loss': '0.1777', 'grad_norm': '5.156', 'learning_rate': '1.652e-05', 'ppl': '1.195', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.58', 'tokens/total': 5312000, 'tokens/trainable': 102424, 'epoch': '0.6917'}
 35%|███▍      | 166/478 [44:09<1:17:45, 14.95s/it] 35%|███▍      | 167/478 [44:24<1:17:28, 14.95s/it]                                                   {'loss': '0.2383', 'grad_norm': '6.719', 'learning_rate': '1.647e-05', 'ppl': '1.269', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.03', 'tokens/total': 5344000, 'tokens/trainable': 103051, 'epoch': '0.6958'}
 35%|███▍      | 167/478 [44:24<1:17:28, 14.95s/it] 35%|███▌      | 168/478 [44:39<1:17:13, 14.95s/it]                                                   {'loss': '0.2178', 'grad_norm': '5.844', 'learning_rate': '1.641e-05', 'ppl': '1.243', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.22', 'tokens/total': 5376000, 'tokens/trainable': 103625, 'epoch': '0.7'}
 35%|███▌      | 168/478 [44:39<1:17:13, 14.95s/it] 35%|███▌      | 169/478 [44:54<1:16:59, 14.95s/it]                                                   {'loss': '0.2124', 'grad_norm': '7.688', 'learning_rate': '1.636e-05', 'ppl': '1.237', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.02', 'tokens/total': 5408000, 'tokens/trainable': 104253, 'epoch': '0.7042'}
 35%|███▌      | 169/478 [44:54<1:16:59, 14.95s/it] 36%|███▌      | 170/478 [45:09<1:16:45, 14.95s/it]                                                   {'loss': '0.1899', 'grad_norm': '5', 'learning_rate': '1.63e-05', 'ppl': '1.209', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.95', 'tokens/total': 5440000, 'tokens/trainable': 104879, 'epoch': '0.7083'}
 36%|███▌      | 170/478 [45:09<1:16:45, 14.95s/it] 36%|███▌      | 171/478 [45:24<1:16:30, 14.95s/it]                                                   {'loss': '0.2271', 'grad_norm': '5.406', 'learning_rate': '1.624e-05', 'ppl': '1.255', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.79', 'tokens/total': 5472000, 'tokens/trainable': 105500, 'epoch': '0.7125'}
 36%|███▌      | 171/478 [45:24<1:16:30, 14.95s/it] 36%|███▌      | 172/478 [45:38<1:16:15, 14.95s/it]                                                   {'loss': '0.3076', 'grad_norm': '6', 'learning_rate': '1.619e-05', 'ppl': '1.36', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.07', 'tokens/total': 5504000, 'tokens/trainable': 106159, 'epoch': '0.7167'}
 36%|███▌      | 172/478 [45:38<1:16:15, 14.95s/it] 36%|███▌      | 173/478 [45:53<1:15:59, 14.95s/it]                                                   {'loss': '0.2109', 'grad_norm': '5.906', 'learning_rate': '1.613e-05', 'ppl': '1.235', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.8', 'tokens/total': 5536000, 'tokens/trainable': 106780, 'epoch': '0.7208'}
 36%|███▌      | 173/478 [45:53<1:15:59, 14.95s/it] 36%|███▋      | 174/478 [46:08<1:15:51, 14.97s/it]                                                   {'loss': '0.2563', 'grad_norm': '5.219', 'learning_rate': '1.607e-05', 'ppl': '1.292', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '24.75', 'tokens/total': 5568000, 'tokens/trainable': 107522, 'epoch': '0.725'}
 36%|███▋      | 174/478 [46:08<1:15:51, 14.97s/it] 37%|███▋      | 175/478 [46:23<1:15:34, 14.97s/it]                                                   {'loss': '0.2666', 'grad_norm': '5.25', 'learning_rate': '1.601e-05', 'ppl': '1.306', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21', 'tokens/total': 5600000, 'tokens/trainable': 108149, 'epoch': '0.7292'}
 37%|███▋      | 175/478 [46:23<1:15:34, 14.97s/it] 37%|███▋      | 176/478 [46:38<1:15:17, 14.96s/it]                                                   {'loss': '0.3096', 'grad_norm': '5.406', 'learning_rate': '1.595e-05', 'ppl': '1.363', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '24.3', 'tokens/total': 5632000, 'tokens/trainable': 108874, 'epoch': '0.7333'}
 37%|███▋      | 176/478 [46:38<1:15:17, 14.96s/it] 37%|███▋      | 177/478 [46:53<1:15:02, 14.96s/it]                                                   {'loss': '0.2432', 'grad_norm': '6.125', 'learning_rate': '1.59e-05', 'ppl': '1.275', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.38', 'tokens/total': 5664000, 'tokens/trainable': 109483, 'epoch': '0.7375'}
 37%|███▋      | 177/478 [46:53<1:15:02, 14.96s/it] 37%|███▋      | 178/478 [47:08<1:14:46, 14.96s/it]                                                   {'loss': '0.1968', 'grad_norm': '4.75', 'learning_rate': '1.584e-05', 'ppl': '1.217', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.87', 'tokens/total': 5696000, 'tokens/trainable': 110166, 'epoch': '0.7417'}
 37%|███▋      | 178/478 [47:08<1:14:46, 14.96s/it] 37%|███▋      | 179/478 [47:23<1:14:31, 14.96s/it]                                                   {'loss': '0.1934', 'grad_norm': '5.656', 'learning_rate': '1.578e-05', 'ppl': '1.213', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '17.58', 'tokens/total': 5728000, 'tokens/trainable': 110691, 'epoch': '0.7458'}
 37%|███▋      | 179/478 [47:23<1:14:31, 14.96s/it] 38%|███▊      | 180/478 [47:38<1:14:16, 14.95s/it]                                                   {'loss': '0.2036', 'grad_norm': '4.781', 'learning_rate': '1.572e-05', 'ppl': '1.226', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.46', 'tokens/total': 5760000, 'tokens/trainable': 111272, 'epoch': '0.75'}
 38%|███▊      | 180/478 [47:38<1:14:16, 14.95s/it] 38%|███▊      | 181/478 [47:53<1:14:01, 14.95s/it]                                                   {'loss': '0.2769', 'grad_norm': '6.219', 'learning_rate': '1.566e-05', 'ppl': '1.319', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.86', 'tokens/total': 5792000, 'tokens/trainable': 111895, 'epoch': '0.7542'}
 38%|███▊      | 181/478 [47:53<1:14:01, 14.95s/it] 38%|███▊      | 182/478 [48:08<1:13:46, 14.95s/it]                                                   {'loss': '0.1821', 'grad_norm': '4.594', 'learning_rate': '1.56e-05', 'ppl': '1.2', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.58', 'tokens/total': 5824000, 'tokens/trainable': 112450, 'epoch': '0.7583'}
 38%|███▊      | 182/478 [48:08<1:13:46, 14.95s/it] 38%|███▊      | 183/478 [48:23<1:13:31, 14.95s/it]                                                   {'loss': '0.2163', 'grad_norm': '5.031', 'learning_rate': '1.554e-05', 'ppl': '1.241', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.02', 'tokens/total': 5856000, 'tokens/trainable': 113018, 'epoch': '0.7625'}
 38%|███▊      | 183/478 [48:23<1:13:31, 14.95s/it] 38%|███▊      | 184/478 [48:38<1:13:15, 14.95s/it]                                                   {'loss': '0.1885', 'grad_norm': '4.406', 'learning_rate': '1.548e-05', 'ppl': '1.207', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.1', 'tokens/total': 5888000, 'tokens/trainable': 113648, 'epoch': '0.7667'}
 38%|███▊      | 184/478 [48:38<1:13:15, 14.95s/it] 39%|███▊      | 185/478 [48:53<1:13:01, 14.95s/it]                                                   {'loss': '0.1938', 'grad_norm': '5.812', 'learning_rate': '1.541e-05', 'ppl': '1.214', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '17.74', 'tokens/total': 5920000, 'tokens/trainable': 114178, 'epoch': '0.7708'}
 39%|███▊      | 185/478 [48:53<1:13:01, 14.95s/it] 39%|███▉      | 186/478 [49:08<1:12:46, 14.95s/it]                                                   {'loss': '0.2041', 'grad_norm': '6.5', 'learning_rate': '1.535e-05', 'ppl': '1.226', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.26', 'tokens/total': 5952000, 'tokens/trainable': 114843, 'epoch': '0.775'}
 39%|███▉      | 186/478 [49:08<1:12:46, 14.95s/it] 39%|███▉      | 187/478 [49:23<1:12:31, 14.95s/it]                                                   {'loss': '0.2329', 'grad_norm': '6.531', 'learning_rate': '1.529e-05', 'ppl': '1.262', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.22', 'tokens/total': 5984000, 'tokens/trainable': 115447, 'epoch': '0.7792'}
 39%|███▉      | 187/478 [49:23<1:12:31, 14.95s/it] 39%|███▉      | 188/478 [49:38<1:12:16, 14.95s/it]                                                   {'loss': '0.1675', 'grad_norm': '4.219', 'learning_rate': '1.523e-05', 'ppl': '1.182', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.43', 'tokens/total': 6016000, 'tokens/trainable': 116087, 'epoch': '0.7833'}
 39%|███▉      | 188/478 [49:38<1:12:16, 14.95s/it] 40%|███▉      | 189/478 [49:53<1:12:01, 14.95s/it]                                                   {'loss': '0.1831', 'grad_norm': '6.25', 'learning_rate': '1.517e-05', 'ppl': '1.201', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.99', 'tokens/total': 6048000, 'tokens/trainable': 116714, 'epoch': '0.7875'}
 40%|███▉      | 189/478 [49:53<1:12:01, 14.95s/it] 40%|███▉      | 190/478 [50:08<1:11:47, 14.96s/it]                                                   {'loss': '0.2251', 'grad_norm': '6', 'learning_rate': '1.51e-05', 'ppl': '1.252', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.75', 'tokens/total': 6080000, 'tokens/trainable': 117364, 'epoch': '0.7917'}
 40%|███▉      | 190/478 [50:08<1:11:47, 14.96s/it] 40%|███▉      | 191/478 [50:23<1:11:33, 14.96s/it]                                                   {'loss': '0.2207', 'grad_norm': '5.125', 'learning_rate': '1.504e-05', 'ppl': '1.247', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '23', 'tokens/total': 6112000, 'tokens/trainable': 118052, 'epoch': '0.7958'}
 40%|███▉      | 191/478 [50:23<1:11:33, 14.96s/it] 40%|████      | 192/478 [50:38<1:11:18, 14.96s/it]                                                   {'loss': '0.2251', 'grad_norm': '5.281', 'learning_rate': '1.498e-05', 'ppl': '1.252', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.53', 'tokens/total': 6144000, 'tokens/trainable': 118695, 'epoch': '0.8'}
 40%|████      | 192/478 [50:38<1:11:18, 14.96s/it] 40%|████      | 193/478 [50:53<1:11:06, 14.97s/it]                                                   {'loss': '0.1963', 'grad_norm': '4.969', 'learning_rate': '1.492e-05', 'ppl': '1.217', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.1', 'tokens/total': 6176000, 'tokens/trainable': 119357, 'epoch': '0.8042'}
 40%|████      | 193/478 [50:53<1:11:06, 14.97s/it] 41%|████      | 194/478 [51:08<1:10:44, 14.95s/it]                                                   {'loss': '0.2544', 'grad_norm': '6.562', 'learning_rate': '1.485e-05', 'ppl': '1.29', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.29', 'tokens/total': 6208000, 'tokens/trainable': 120020, 'epoch': '0.8083'}
 41%|████      | 194/478 [51:08<1:10:44, 14.95s/it] 41%|████      | 195/478 [51:22<1:10:29, 14.95s/it]                                                   {'loss': '0.1855', 'grad_norm': '4.75', 'learning_rate': '1.479e-05', 'ppl': '1.204', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.68', 'tokens/total': 6240000, 'tokens/trainable': 120697, 'epoch': '0.8125'}
 41%|████      | 195/478 [51:22<1:10:29, 14.95s/it] 41%|████      | 196/478 [51:37<1:10:13, 14.94s/it]                                                   {'loss': '0.2305', 'grad_norm': '4.844', 'learning_rate': '1.472e-05', 'ppl': '1.259', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.5', 'tokens/total': 6272000, 'tokens/trainable': 121368, 'epoch': '0.8167'}
 41%|████      | 196/478 [51:37<1:10:13, 14.94s/it] 41%|████      | 197/478 [51:52<1:10:00, 14.95s/it]                                                   {'loss': '0.2656', 'grad_norm': '5.969', 'learning_rate': '1.466e-05', 'ppl': '1.304', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.08', 'tokens/total': 6304000, 'tokens/trainable': 121938, 'epoch': '0.8208'}
 41%|████      | 197/478 [51:52<1:10:00, 14.95s/it] 41%|████▏     | 198/478 [52:07<1:09:45, 14.95s/it]                                                   {'loss': '0.2202', 'grad_norm': '6.125', 'learning_rate': '1.46e-05', 'ppl': '1.246', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.7', 'tokens/total': 6336000, 'tokens/trainable': 122586, 'epoch': '0.825'}
 41%|████▏     | 198/478 [52:07<1:09:45, 14.95s/it] 42%|████▏     | 199/478 [52:22<1:09:30, 14.95s/it]                                                   {'loss': '0.2485', 'grad_norm': '7.156', 'learning_rate': '1.453e-05', 'ppl': '1.282', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.35', 'tokens/total': 6368000, 'tokens/trainable': 123164, 'epoch': '0.8292'}
 42%|████▏     | 199/478 [52:22<1:09:30, 14.95s/it] 42%|████▏     | 200/478 [52:37<1:09:16, 14.95s/it]                                                   {'loss': '0.2705', 'grad_norm': '6.5', 'learning_rate': '1.447e-05', 'ppl': '1.311', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.86', 'tokens/total': 6400000, 'tokens/trainable': 123787, 'epoch': '0.8333'}
 42%|████▏     | 200/478 [52:37<1:09:16, 14.95s/it] 42%|████▏     | 201/478 [52:52<1:09:01, 14.95s/it]                                                   {'loss': '0.2153', 'grad_norm': '5.312', 'learning_rate': '1.44e-05', 'ppl': '1.24', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.86', 'tokens/total': 6432000, 'tokens/trainable': 124410, 'epoch': '0.8375'}
 42%|████▏     | 201/478 [52:52<1:09:01, 14.95s/it] 42%|████▏     | 202/478 [53:07<1:08:46, 14.95s/it]                                                   {'loss': '0.2012', 'grad_norm': '5.531', 'learning_rate': '1.433e-05', 'ppl': '1.223', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.95', 'tokens/total': 6464000, 'tokens/trainable': 124976, 'epoch': '0.8417'}
 42%|████▏     | 202/478 [53:07<1:08:46, 14.95s/it] 42%|████▏     | 203/478 [53:22<1:08:32, 14.95s/it]                                                   {'loss': '0.2217', 'grad_norm': '5.812', 'learning_rate': '1.427e-05', 'ppl': '1.248', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.09', 'tokens/total': 6496000, 'tokens/trainable': 125606, 'epoch': '0.8458'}
 42%|████▏     | 203/478 [53:22<1:08:32, 14.95s/it] 43%|████▎     | 204/478 [53:37<1:08:17, 14.95s/it]                                                   {'loss': '0.2534', 'grad_norm': '5.469', 'learning_rate': '1.42e-05', 'ppl': '1.288', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '23.2', 'tokens/total': 6528000, 'tokens/trainable': 126299, 'epoch': '0.85'}
 43%|████▎     | 204/478 [53:37<1:08:17, 14.95s/it] 43%|████▎     | 205/478 [53:52<1:08:00, 14.95s/it]                                                   {'loss': '0.1675', 'grad_norm': '4.281', 'learning_rate': '1.414e-05', 'ppl': '1.182', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.46', 'tokens/total': 6560000, 'tokens/trainable': 126939, 'epoch': '0.8542'}
 43%|████▎     | 205/478 [53:52<1:08:00, 14.95s/it] 43%|████▎     | 206/478 [54:07<1:07:45, 14.95s/it]                                                   {'loss': '0.1685', 'grad_norm': '4.281', 'learning_rate': '1.407e-05', 'ppl': '1.183', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.8', 'tokens/total': 6592000, 'tokens/trainable': 127590, 'epoch': '0.8583'}
 43%|████▎     | 206/478 [54:07<1:07:45, 14.95s/it] 43%|████▎     | 207/478 [54:22<1:07:31, 14.95s/it]                                                   {'loss': '0.229', 'grad_norm': '5.531', 'learning_rate': '1.4e-05', 'ppl': '1.257', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '23.74', 'tokens/total': 6624000, 'tokens/trainable': 128299, 'epoch': '0.8625'}
 43%|████▎     | 207/478 [54:22<1:07:31, 14.95s/it] 44%|████▎     | 208/478 [54:37<1:07:16, 14.95s/it]                                                   {'loss': '0.1836', 'grad_norm': '7.562', 'learning_rate': '1.394e-05', 'ppl': '1.202', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.96', 'tokens/total': 6656000, 'tokens/trainable': 128895, 'epoch': '0.8667'}
 44%|████▎     | 208/478 [54:37<1:07:16, 14.95s/it] 44%|████▎     | 209/478 [54:52<1:06:56, 14.93s/it]                                                   {'loss': '0.2495', 'grad_norm': '5.031', 'learning_rate': '1.387e-05', 'ppl': '1.283', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '23.14', 'tokens/total': 6688000, 'tokens/trainable': 129583, 'epoch': '0.8708'}
 44%|████▎     | 209/478 [54:52<1:06:56, 14.93s/it] 44%|████▍     | 210/478 [55:07<1:06:43, 14.94s/it]                                                   {'loss': '0.1885', 'grad_norm': '4.531', 'learning_rate': '1.38e-05', 'ppl': '1.207', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.06', 'tokens/total': 6720000, 'tokens/trainable': 130212, 'epoch': '0.875'}
 44%|████▍     | 210/478 [55:07<1:06:43, 14.94s/it] 44%|████▍     | 211/478 [55:22<1:07:09, 15.09s/it]                                                   {'loss': '0.1689', 'grad_norm': '4.281', 'learning_rate': '1.373e-05', 'ppl': '1.184', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.63', 'tokens/total': 6752000, 'tokens/trainable': 130880, 'epoch': '0.8792'}
 44%|████▍     | 211/478 [55:22<1:07:09, 15.09s/it] 44%|████▍     | 212/478 [55:37<1:06:38, 15.03s/it]                                                   {'loss': '0.2295', 'grad_norm': '4.906', 'learning_rate': '1.367e-05', 'ppl': '1.258', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.62', 'tokens/total': 6784000, 'tokens/trainable': 131523, 'epoch': '0.8833'}
 44%|████▍     | 212/478 [55:37<1:06:38, 15.03s/it] 45%|████▍     | 213/478 [55:52<1:06:16, 15.00s/it]                                                   {'loss': '0.2222', 'grad_norm': '4.969', 'learning_rate': '1.36e-05', 'ppl': '1.249', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.21', 'tokens/total': 6816000, 'tokens/trainable': 132186, 'epoch': '0.8875'}
 45%|████▍     | 213/478 [55:52<1:06:16, 15.00s/it] 45%|████▍     | 214/478 [56:07<1:05:56, 14.99s/it]                                                   {'loss': '0.2119', 'grad_norm': '5', 'learning_rate': '1.353e-05', 'ppl': '1.236', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '16.88', 'tokens/total': 6848000, 'tokens/trainable': 132690, 'epoch': '0.8917'}
 45%|████▍     | 214/478 [56:07<1:05:56, 14.99s/it] 45%|████▍     | 215/478 [56:22<1:05:38, 14.98s/it]                                                   {'loss': '0.1875', 'grad_norm': '5.219', 'learning_rate': '1.346e-05', 'ppl': '1.206', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.29', 'tokens/total': 6880000, 'tokens/trainable': 133296, 'epoch': '0.8958'}
 45%|████▍     | 215/478 [56:22<1:05:38, 14.98s/it] 45%|████▌     | 216/478 [56:37<1:05:21, 14.97s/it]                                                   {'loss': '0.2109', 'grad_norm': '4.906', 'learning_rate': '1.339e-05', 'ppl': '1.235', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.47', 'tokens/total': 6912000, 'tokens/trainable': 133937, 'epoch': '0.9'}
 45%|████▌     | 216/478 [56:37<1:05:21, 14.97s/it] 45%|████▌     | 217/478 [56:52<1:05:04, 14.96s/it]                                                   {'loss': '0.2026', 'grad_norm': '5.781', 'learning_rate': '1.332e-05', 'ppl': '1.225', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.3', 'tokens/total': 6944000, 'tokens/trainable': 134573, 'epoch': '0.9042'}
 45%|████▌     | 217/478 [56:52<1:05:04, 14.96s/it] 46%|████▌     | 218/478 [57:07<1:04:47, 14.95s/it]                                                   {'loss': '0.1709', 'grad_norm': '3.938', 'learning_rate': '1.326e-05', 'ppl': '1.186', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.43', 'tokens/total': 6976000, 'tokens/trainable': 135242, 'epoch': '0.9083'}
 46%|████▌     | 218/478 [57:07<1:04:47, 14.95s/it] 46%|████▌     | 219/478 [57:22<1:04:32, 14.95s/it]                                                   {'loss': '0.1785', 'grad_norm': '4.594', 'learning_rate': '1.319e-05', 'ppl': '1.195', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.04', 'tokens/total': 7008000, 'tokens/trainable': 135900, 'epoch': '0.9125'}
 46%|████▌     | 219/478 [57:22<1:04:32, 14.95s/it] 46%|████▌     | 220/478 [57:37<1:04:17, 14.95s/it]                                                   {'loss': '0.188', 'grad_norm': '4.688', 'learning_rate': '1.312e-05', 'ppl': '1.207', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.67', 'tokens/total': 7040000, 'tokens/trainable': 136577, 'epoch': '0.9167'}
 46%|████▌     | 220/478 [57:37<1:04:17, 14.95s/it] 46%|████▌     | 221/478 [57:51<1:04:00, 14.94s/it]                                                   {'loss': '0.2163', 'grad_norm': '5.906', 'learning_rate': '1.305e-05', 'ppl': '1.241', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '23.88', 'tokens/total': 7072000, 'tokens/trainable': 137289, 'epoch': '0.9208'}
 46%|████▌     | 221/478 [57:51<1:04:00, 14.94s/it] 46%|████▋     | 222/478 [58:06<1:03:45, 14.95s/it]                                                   {'loss': '0.2183', 'grad_norm': '5.125', 'learning_rate': '1.298e-05', 'ppl': '1.244', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.47', 'tokens/total': 7104000, 'tokens/trainable': 137900, 'epoch': '0.925'}
 46%|████▋     | 222/478 [58:06<1:03:45, 14.95s/it] 47%|████▋     | 223/478 [58:21<1:03:31, 14.95s/it]                                                   {'loss': '0.2695', 'grad_norm': '6.469', 'learning_rate': '1.291e-05', 'ppl': '1.309', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.2', 'tokens/total': 7136000, 'tokens/trainable': 138533, 'epoch': '0.9292'}
 47%|████▋     | 223/478 [58:21<1:03:31, 14.95s/it] 47%|████▋     | 224/478 [58:36<1:03:16, 14.95s/it]                                                   {'loss': '0.1685', 'grad_norm': '5.625', 'learning_rate': '1.284e-05', 'ppl': '1.183', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.9', 'tokens/total': 7168000, 'tokens/trainable': 139157, 'epoch': '0.9333'}
 47%|████▋     | 224/478 [58:36<1:03:16, 14.95s/it] 47%|████▋     | 225/478 [58:51<1:03:02, 14.95s/it]                                                   {'loss': '0.1929', 'grad_norm': '4.969', 'learning_rate': '1.277e-05', 'ppl': '1.213', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.86', 'tokens/total': 7200000, 'tokens/trainable': 139810, 'epoch': '0.9375'}
 47%|████▋     | 225/478 [58:51<1:03:02, 14.95s/it] 47%|████▋     | 226/478 [59:06<1:02:47, 14.95s/it]                                                   {'loss': '0.2168', 'grad_norm': '5.719', 'learning_rate': '1.27e-05', 'ppl': '1.242', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.69', 'tokens/total': 7232000, 'tokens/trainable': 140398, 'epoch': '0.9417'}
 47%|████▋     | 226/478 [59:06<1:02:47, 14.95s/it] 47%|████▋     | 227/478 [59:21<1:02:32, 14.95s/it]                                                   {'loss': '0.2075', 'grad_norm': '5.344', 'learning_rate': '1.263e-05', 'ppl': '1.231', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.47', 'tokens/total': 7264000, 'tokens/trainable': 141069, 'epoch': '0.9458'}
 47%|████▋     | 227/478 [59:21<1:02:32, 14.95s/it] 48%|████▊     | 228/478 [59:36<1:02:17, 14.95s/it]                                                   {'loss': '0.2222', 'grad_norm': '4.969', 'learning_rate': '1.256e-05', 'ppl': '1.249', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.02', 'tokens/total': 7296000, 'tokens/trainable': 141697, 'epoch': '0.95'}
 48%|████▊     | 228/478 [59:36<1:02:17, 14.95s/it] 48%|████▊     | 229/478 [59:51<1:02:02, 14.95s/it]                                                   {'loss': '0.2402', 'grad_norm': '5.469', 'learning_rate': '1.249e-05', 'ppl': '1.272', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '17.82', 'tokens/total': 7328000, 'tokens/trainable': 142229, 'epoch': '0.9542'}
 48%|████▊     | 229/478 [59:51<1:02:02, 14.95s/it] 48%|████▊     | 230/478 [1:00:06<1:01:47, 14.95s/it]                                                     {'loss': '0.2148', 'grad_norm': '5.625', 'learning_rate': '1.242e-05', 'ppl': '1.24', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.49', 'tokens/total': 7360000, 'tokens/trainable': 142841, 'epoch': '0.9583'}
 48%|████▊     | 230/478 [1:00:06<1:01:47, 14.95s/it] 48%|████▊     | 231/478 [1:00:21<1:01:32, 14.95s/it]                                                     {'loss': '0.188', 'grad_norm': '5.344', 'learning_rate': '1.235e-05', 'ppl': '1.207', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.23', 'tokens/total': 7392000, 'tokens/trainable': 143475, 'epoch': '0.9625'}
 48%|████▊     | 231/478 [1:00:21<1:01:32, 14.95s/it] 49%|████▊     | 232/478 [1:00:36<1:01:17, 14.95s/it]                                                     {'loss': '0.2285', 'grad_norm': '5.969', 'learning_rate': '1.228e-05', 'ppl': '1.257', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '17.81', 'tokens/total': 7424000, 'tokens/trainable': 144007, 'epoch': '0.9667'}
 49%|████▊     | 232/478 [1:00:36<1:01:17, 14.95s/it] 49%|████▊     | 233/478 [1:00:51<1:01:05, 14.96s/it]                                                     {'loss': '0.1855', 'grad_norm': '5.375', 'learning_rate': '1.22e-05', 'ppl': '1.204', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.28', 'tokens/total': 7456000, 'tokens/trainable': 144584, 'epoch': '0.9708'}
 49%|████▊     | 233/478 [1:00:51<1:01:05, 14.96s/it] 49%|████▉     | 234/478 [1:01:06<1:00:48, 14.95s/it]                                                     {'loss': '0.1929', 'grad_norm': '4.344', 'learning_rate': '1.213e-05', 'ppl': '1.213', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '23.1', 'tokens/total': 7488000, 'tokens/trainable': 145273, 'epoch': '0.975'}
 49%|████▉     | 234/478 [1:01:06<1:00:48, 14.95s/it] 49%|████▉     | 235/478 [1:01:21<1:00:33, 14.95s/it]                                                     {'loss': '0.1831', 'grad_norm': '5.031', 'learning_rate': '1.206e-05', 'ppl': '1.201', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.3', 'tokens/total': 7520000, 'tokens/trainable': 145879, 'epoch': '0.9792'}
 49%|████▉     | 235/478 [1:01:21<1:00:33, 14.95s/it] 49%|████▉     | 236/478 [1:01:36<1:00:18, 14.95s/it]                                                     {'loss': '0.2319', 'grad_norm': '5.719', 'learning_rate': '1.199e-05', 'ppl': '1.261', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.52', 'tokens/total': 7552000, 'tokens/trainable': 146552, 'epoch': '0.9833'}
 49%|████▉     | 236/478 [1:01:36<1:00:18, 14.95s/it] 50%|████▉     | 237/478 [1:01:51<1:00:03, 14.95s/it]                                                     {'loss': '0.251', 'grad_norm': '5.812', 'learning_rate': '1.192e-05', 'ppl': '1.285', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.9', 'tokens/total': 7584000, 'tokens/trainable': 147206, 'epoch': '0.9875'}
 50%|████▉     | 237/478 [1:01:51<1:00:03, 14.95s/it] 50%|████▉     | 238/478 [1:02:06<59:48, 14.95s/it]                                                     {'loss': '0.2085', 'grad_norm': '5.406', 'learning_rate': '1.185e-05', 'ppl': '1.232', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.62', 'tokens/total': 7616000, 'tokens/trainable': 147792, 'epoch': '0.9917'}
 50%|████▉     | 238/478 [1:02:06<59:48, 14.95s/it] 50%|█████     | 239/478 [1:02:21<59:33, 14.95s/it]                                                   {'loss': '0.2236', 'grad_norm': '6.562', 'learning_rate': '1.178e-05', 'ppl': '1.251', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.16', 'tokens/total': 7648000, 'tokens/trainable': 148424, 'epoch': '0.9958'}
 50%|█████     | 239/478 [1:02:21<59:33, 14.95s/it][2026-04-17 03:11:51,731] [INFO] [axolotl.core.trainers.base] Saving model checkpoint to /workspace/data/outputs/qwen3-4B/fft_magnifi-module-classifier-04-17-relabelled-upsampled/checkpoint-239

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]
Writing model shards: 100%|██████████| 1/1 [00:22<00:00, 22.99s/it]Writing model shards: 100%|██████████| 1/1 [00:22<00:00, 22.99s/it]
 50%|█████     | 240/478 [1:04:39<3:26:42, 52.11s/it]                                                     {'loss': '0.229', 'grad_norm': '5.062', 'learning_rate': '1.17e-05', 'ppl': '1.257', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.67', 'tokens/total': 7680000, 'tokens/trainable': 149041, 'epoch': '1'}
 50%|█████     | 240/478 [1:04:39<3:26:42, 52.11s/it][2026-04-17 03:14:04,074] [INFO] [axolotl.core.trainers.base] Running evaluation step...
[2026-04-17 03:14:12,814] [INFO] [axolotl.utils.samplers.multipack] gather_len_batches: [54, 54]

  0%|          | 0/27 [00:00<?, ?it/s]
  7%|▋         | 2/27 [00:02<00:35,  1.41s/it]
 11%|█         | 3/27 [00:05<00:47,  1.97s/it]
 15%|█▍        | 4/27 [00:08<00:51,  2.26s/it]
 19%|█▊        | 5/27 [00:11<00:53,  2.43s/it]
 22%|██▏       | 6/27 [00:13<00:53,  2.54s/it]
 26%|██▌       | 7/27 [00:16<00:52,  2.61s/it]
 30%|██▉       | 8/27 [00:19<00:50,  2.66s/it]
 33%|███▎      | 9/27 [00:22<00:48,  2.68s/it]
 37%|███▋      | 10/27 [00:24<00:45,  2.70s/it]
 41%|████      | 11/27 [00:27<00:43,  2.72s/it]
 44%|████▍     | 12/27 [00:30<00:39,  2.64s/it]
 48%|████▊     | 13/27 [00:33<00:38,  2.76s/it]
 52%|█████▏    | 14/27 [00:35<00:35,  2.76s/it]
 56%|█████▌    | 15/27 [00:38<00:33,  2.76s/it]
 59%|█████▉    | 16/27 [00:41<00:30,  2.76s/it]
 63%|██████▎   | 17/27 [00:44<00:27,  2.75s/it]
 67%|██████▋   | 18/27 [00:46<00:24,  2.75s/it]
 70%|███████   | 19/27 [00:49<00:22,  2.75s/it]
 74%|███████▍  | 20/27 [00:52<00:19,  2.75s/it]
 78%|███████▊  | 21/27 [00:54<00:16,  2.67s/it]
 81%|████████▏ | 22/27 [00:57<00:13,  2.78s/it]
 85%|████████▌ | 23/27 [01:00<00:11,  2.77s/it]
 89%|████████▉ | 24/27 [01:03<00:08,  2.76s/it]
 93%|█████████▎| 25/27 [01:06<00:05,  2.76s/it]
 96%|█████████▋| 26/27 [01:08<00:02,  2.76s/it]
100%|██████████| 27/27 [01:11<00:00,  2.84s/it]                                                     
                                               {'eval_loss': '0.2166', 'eval_runtime': '75.42', 'eval_samples_per_second': '2.771', 'eval_steps_per_second': '1.392', 'eval_ppl': '1.242', 'memory/max_active (GiB)': '34.91', 'memory/max_allocated (GiB)': '34.91', 'memory/device_reserved (GiB)': '57.54', 'epoch': '1', 'tokens/train_per_sec_per_gpu': '0'}
 50%|█████     | 240/478 [1:06:04<3:26:42, 52.11s/it]
100%|██████████| 27/27 [01:13<00:00,  2.84s/it]
                                                50%|█████     | 241/478 [1:06:20<4:22:47, 66.53s/it]                                                     {'loss': '0.2046', 'grad_norm': '5.375', 'learning_rate': '1.163e-05', 'ppl': '1.227', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.95', 'tokens/total': 7712000, 'tokens/trainable': 149728, 'epoch': '1.004'}
 50%|█████     | 241/478 [1:06:20<4:22:47, 66.53s/it] 51%|█████     | 242/478 [1:06:35<3:20:49, 51.06s/it]                                                     {'loss': '0.2031', 'grad_norm': '5.406', 'learning_rate': '1.156e-05', 'ppl': '1.225', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.85', 'tokens/total': 7744000, 'tokens/trainable': 150351, 'epoch': '1.008'}
 51%|█████     | 242/478 [1:06:35<3:20:49, 51.06s/it] 51%|█████     | 243/478 [1:06:50<2:37:33, 40.23s/it]                                                     {'loss': '0.1758', 'grad_norm': '5', 'learning_rate': '1.149e-05', 'ppl': '1.192', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.47', 'tokens/total': 7776000, 'tokens/trainable': 150903, 'epoch': '1.012'}
 51%|█████     | 243/478 [1:06:50<2:37:33, 40.23s/it] 51%|█████     | 244/478 [1:07:04<2:07:19, 32.65s/it]                                                     {'loss': '0.1514', 'grad_norm': '4.969', 'learning_rate': '1.142e-05', 'ppl': '1.163', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.07', 'tokens/total': 7808000, 'tokens/trainable': 151443, 'epoch': '1.017'}
 51%|█████     | 244/478 [1:07:04<2:07:19, 32.65s/it] 51%|█████▏    | 245/478 [1:07:19<1:46:10, 27.34s/it]                                                     {'loss': '0.186', 'grad_norm': '4.875', 'learning_rate': '1.134e-05', 'ppl': '1.204', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.38', 'tokens/total': 7840000, 'tokens/trainable': 151992, 'epoch': '1.021'}
 51%|█████▏    | 245/478 [1:07:19<1:46:10, 27.34s/it] 51%|█████▏    | 246/478 [1:07:34<1:31:21, 23.63s/it]                                                     {'loss': '0.1411', 'grad_norm': '3.984', 'learning_rate': '1.127e-05', 'ppl': '1.152', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.85', 'tokens/total': 7872000, 'tokens/trainable': 152615, 'epoch': '1.025'}
 51%|█████▏    | 246/478 [1:07:34<1:31:21, 23.63s/it] 52%|█████▏    | 247/478 [1:07:49<1:20:56, 21.02s/it]                                                     {'loss': '0.2158', 'grad_norm': '6.188', 'learning_rate': '1.12e-05', 'ppl': '1.241', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.35', 'tokens/total': 7904000, 'tokens/trainable': 153193, 'epoch': '1.029'}
 52%|█████▏    | 247/478 [1:07:49<1:20:56, 21.02s/it] 52%|█████▏    | 248/478 [1:08:04<1:13:37, 19.20s/it]                                                     {'loss': '0.1768', 'grad_norm': '4.625', 'learning_rate': '1.113e-05', 'ppl': '1.193', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.75', 'tokens/total': 7936000, 'tokens/trainable': 153783, 'epoch': '1.033'}
 52%|█████▏    | 248/478 [1:08:04<1:13:37, 19.20s/it] 52%|█████▏    | 249/478 [1:08:19<1:08:26, 17.93s/it]                                                     {'loss': '0.1704', 'grad_norm': '4.625', 'learning_rate': '1.105e-05', 'ppl': '1.186', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.35', 'tokens/total': 7968000, 'tokens/trainable': 154451, 'epoch': '1.038'}
 52%|█████▏    | 249/478 [1:08:19<1:08:26, 17.93s/it] 52%|█████▏    | 250/478 [1:08:34<1:04:44, 17.04s/it]                                                     {'loss': '0.186', 'grad_norm': '4.688', 'learning_rate': '1.098e-05', 'ppl': '1.204', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.79', 'tokens/total': 8000000, 'tokens/trainable': 155072, 'epoch': '1.042'}
 52%|█████▏    | 250/478 [1:08:34<1:04:44, 17.04s/it] 53%|█████▎    | 251/478 [1:08:49<1:02:06, 16.42s/it]                                                     {'loss': '0.1641', 'grad_norm': '4.125', 'learning_rate': '1.091e-05', 'ppl': '1.178', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.54', 'tokens/total': 8032000, 'tokens/trainable': 155656, 'epoch': '1.046'}
 53%|█████▎    | 251/478 [1:08:49<1:02:06, 16.42s/it] 53%|█████▎    | 252/478 [1:09:04<1:00:10, 15.98s/it]                                                     {'loss': '0.1631', 'grad_norm': '4.5', 'learning_rate': '1.084e-05', 'ppl': '1.177', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.02', 'tokens/total': 8064000, 'tokens/trainable': 156254, 'epoch': '1.05'}
 53%|█████▎    | 252/478 [1:09:04<1:00:10, 15.98s/it] 53%|█████▎    | 253/478 [1:09:19<58:45, 15.67s/it]                                                     {'loss': '0.2163', 'grad_norm': '4.969', 'learning_rate': '1.076e-05', 'ppl': '1.241', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.45', 'tokens/total': 8096000, 'tokens/trainable': 156805, 'epoch': '1.054'}
 53%|█████▎    | 253/478 [1:09:19<58:45, 15.67s/it] 53%|█████▎    | 254/478 [1:09:34<57:42, 15.46s/it]                                                   {'loss': '0.1792', 'grad_norm': '4.625', 'learning_rate': '1.069e-05', 'ppl': '1.196', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.95', 'tokens/total': 8128000, 'tokens/trainable': 157371, 'epoch': '1.058'}
 53%|█████▎    | 254/478 [1:09:34<57:42, 15.46s/it] 53%|█████▎    | 255/478 [1:09:49<56:53, 15.31s/it]                                                   {'loss': '0.1528', 'grad_norm': '5.031', 'learning_rate': '1.062e-05', 'ppl': '1.165', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.46', 'tokens/total': 8160000, 'tokens/trainable': 158042, 'epoch': '1.062'}
 53%|█████▎    | 255/478 [1:09:49<56:53, 15.31s/it] 54%|█████▎    | 256/478 [1:10:04<56:14, 15.20s/it]                                                   {'loss': '0.1448', 'grad_norm': '4.219', 'learning_rate': '1.055e-05', 'ppl': '1.156', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.79', 'tokens/total': 8192000, 'tokens/trainable': 158693, 'epoch': '1.067'}
 54%|█████▎    | 256/478 [1:10:04<56:14, 15.20s/it] 54%|█████▍    | 257/478 [1:10:19<55:41, 15.12s/it]                                                   {'loss': '0.09375', 'grad_norm': '3.844', 'learning_rate': '1.047e-05', 'ppl': '1.098', 'memory/max_active (GiB)': '45.17', 'memory/max_allocated (GiB)': '45.17', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '23.17', 'tokens/total': 8224000, 'tokens/trainable': 159384, 'epoch': '1.071'}
 54%|█████▍    | 257/478 [1:10:19<55:41, 15.12s/it] 54%|█████▍    | 258/478 [1:10:34<55:16, 15.07s/it]                                                   {'loss': '0.1543', 'grad_norm': '4.625', 'learning_rate': '1.04e-05', 'ppl': '1.167', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.72', 'tokens/total': 8256000, 'tokens/trainable': 160063, 'epoch': '1.075'}
 54%|█████▍    | 258/478 [1:10:34<55:16, 15.07s/it] 54%|█████▍    | 259/478 [1:10:49<54:54, 15.05s/it]                                                   {'loss': '0.187', 'grad_norm': '5.406', 'learning_rate': '1.033e-05', 'ppl': '1.206', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.78', 'tokens/total': 8288000, 'tokens/trainable': 160625, 'epoch': '1.079'}
 54%|█████▍    | 259/478 [1:10:49<54:54, 15.05s/it] 54%|█████▍    | 260/478 [1:11:04<54:33, 15.02s/it]                                                   {'loss': '0.1433', 'grad_norm': '4.312', 'learning_rate': '1.026e-05', 'ppl': '1.154', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.33', 'tokens/total': 8320000, 'tokens/trainable': 161262, 'epoch': '1.083'}
 54%|█████▍    | 260/478 [1:11:04<54:33, 15.02s/it] 55%|█████▍    | 261/478 [1:11:19<54:49, 15.16s/it]                                                   {'loss': '0.1436', 'grad_norm': '4.25', 'learning_rate': '1.018e-05', 'ppl': '1.154', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.3', 'tokens/total': 8352000, 'tokens/trainable': 161921, 'epoch': '1.087'}
 55%|█████▍    | 261/478 [1:11:19<54:49, 15.16s/it] 55%|█████▍    | 262/478 [1:11:34<54:21, 15.10s/it]                                                   {'loss': '0.168', 'grad_norm': '4.5', 'learning_rate': '1.011e-05', 'ppl': '1.183', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.92', 'tokens/total': 8384000, 'tokens/trainable': 162546, 'epoch': '1.092'}
 55%|█████▍    | 262/478 [1:11:34<54:21, 15.10s/it] 55%|█████▌    | 263/478 [1:11:49<53:55, 15.05s/it]                                                   {'loss': '0.1802', 'grad_norm': '5.188', 'learning_rate': '1.004e-05', 'ppl': '1.197', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.87', 'tokens/total': 8416000, 'tokens/trainable': 163228, 'epoch': '1.096'}
 55%|█████▌    | 263/478 [1:11:49<53:55, 15.05s/it] 55%|█████▌    | 264/478 [1:12:04<53:34, 15.02s/it]                                                   {'loss': '0.1252', 'grad_norm': '4.281', 'learning_rate': '9.964e-06', 'ppl': '1.133', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '16.5', 'tokens/total': 8448000, 'tokens/trainable': 163721, 'epoch': '1.1'}
 55%|█████▌    | 264/478 [1:12:04<53:34, 15.02s/it] 55%|█████▌    | 265/478 [1:12:19<53:15, 15.00s/it]                                                   {'loss': '0.1909', 'grad_norm': '4.844', 'learning_rate': '9.891e-06', 'ppl': '1.21', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.59', 'tokens/total': 8480000, 'tokens/trainable': 164366, 'epoch': '1.104'}
 55%|█████▌    | 265/478 [1:12:19<53:15, 15.00s/it] 56%|█████▌    | 266/478 [1:12:34<52:57, 14.99s/it]                                                   {'loss': '0.1145', 'grad_norm': '4.062', 'learning_rate': '9.818e-06', 'ppl': '1.121', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.35', 'tokens/total': 8512000, 'tokens/trainable': 164944, 'epoch': '1.108'}
 56%|█████▌    | 266/478 [1:12:34<52:57, 14.99s/it] 56%|█████▌    | 267/478 [1:12:49<52:40, 14.98s/it]                                                   {'loss': '0.1013', 'grad_norm': '3.5', 'learning_rate': '9.745e-06', 'ppl': '1.107', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.17', 'tokens/total': 8544000, 'tokens/trainable': 165576, 'epoch': '1.113'}
 56%|█████▌    | 267/478 [1:12:49<52:40, 14.98s/it] 56%|█████▌    | 268/478 [1:13:04<52:22, 14.96s/it]                                                   {'loss': '0.1455', 'grad_norm': '4.281', 'learning_rate': '9.672e-06', 'ppl': '1.157', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '23.01', 'tokens/total': 8576000, 'tokens/trainable': 166262, 'epoch': '1.117'}
 56%|█████▌    | 268/478 [1:13:04<52:22, 14.96s/it] 56%|█████▋    | 269/478 [1:13:19<52:06, 14.96s/it]                                                   {'loss': '0.1101', 'grad_norm': '3.75', 'learning_rate': '9.599e-06', 'ppl': '1.116', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.57', 'tokens/total': 8608000, 'tokens/trainable': 166936, 'epoch': '1.121'}
 56%|█████▋    | 269/478 [1:13:19<52:06, 14.96s/it] 56%|█████▋    | 270/478 [1:13:34<51:51, 14.96s/it]                                                   {'loss': '0.1221', 'grad_norm': '4.031', 'learning_rate': '9.526e-06', 'ppl': '1.13', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '24.2', 'tokens/total': 8640000, 'tokens/trainable': 167659, 'epoch': '1.125'}
 56%|█████▋    | 270/478 [1:13:34<51:51, 14.96s/it] 57%|█████▋    | 271/478 [1:13:49<51:36, 14.96s/it]                                                   {'loss': '0.09717', 'grad_norm': '4', 'learning_rate': '9.454e-06', 'ppl': '1.102', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.16', 'tokens/total': 8672000, 'tokens/trainable': 168291, 'epoch': '1.129'}
 57%|█████▋    | 271/478 [1:13:49<51:36, 14.96s/it] 57%|█████▋    | 272/478 [1:14:04<51:21, 14.96s/it]                                                   {'loss': '0.123', 'grad_norm': '3.75', 'learning_rate': '9.381e-06', 'ppl': '1.131', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.56', 'tokens/total': 8704000, 'tokens/trainable': 168965, 'epoch': '1.133'}
 57%|█████▋    | 272/478 [1:14:04<51:21, 14.96s/it] 57%|█████▋    | 273/478 [1:14:19<51:06, 14.96s/it]                                                   {'loss': '0.1218', 'grad_norm': '4.906', 'learning_rate': '9.308e-06', 'ppl': '1.13', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.79', 'tokens/total': 8736000, 'tokens/trainable': 169586, 'epoch': '1.137'}
 57%|█████▋    | 273/478 [1:14:19<51:06, 14.96s/it] 57%|█████▋    | 274/478 [1:14:34<50:50, 14.96s/it]                                                   {'loss': '0.1606', 'grad_norm': '5.156', 'learning_rate': '9.235e-06', 'ppl': '1.174', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.16', 'tokens/total': 8768000, 'tokens/trainable': 170188, 'epoch': '1.142'}
 57%|█████▋    | 274/478 [1:14:34<50:50, 14.96s/it] 58%|█████▊    | 275/478 [1:14:49<50:35, 14.95s/it]                                                   {'loss': '0.1206', 'grad_norm': '4.062', 'learning_rate': '9.163e-06', 'ppl': '1.128', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '23.04', 'tokens/total': 8800000, 'tokens/trainable': 170876, 'epoch': '1.146'}
 58%|█████▊    | 275/478 [1:14:49<50:35, 14.95s/it] 58%|█████▊    | 276/478 [1:15:04<50:20, 14.95s/it]                                                   {'loss': '0.123', 'grad_norm': '4.781', 'learning_rate': '9.09e-06', 'ppl': '1.131', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.44', 'tokens/total': 8832000, 'tokens/trainable': 171546, 'epoch': '1.15'}
 58%|█████▊    | 276/478 [1:15:04<50:20, 14.95s/it] 58%|█████▊    | 277/478 [1:15:18<50:05, 14.95s/it]                                                   {'loss': '0.08105', 'grad_norm': '3.578', 'learning_rate': '9.018e-06', 'ppl': '1.084', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.29', 'tokens/total': 8864000, 'tokens/trainable': 172122, 'epoch': '1.154'}
 58%|█████▊    | 277/478 [1:15:19<50:05, 14.95s/it] 58%|█████▊    | 278/478 [1:15:33<49:50, 14.95s/it]                                                   {'loss': '0.1572', 'grad_norm': '4.594', 'learning_rate': '8.945e-06', 'ppl': '1.17', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '23.03', 'tokens/total': 8896000, 'tokens/trainable': 172810, 'epoch': '1.158'}
 58%|█████▊    | 278/478 [1:15:33<49:50, 14.95s/it] 58%|█████▊    | 279/478 [1:15:48<49:34, 14.95s/it]                                                   {'loss': '0.09741', 'grad_norm': '3.922', 'learning_rate': '8.873e-06', 'ppl': '1.102', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '24.94', 'tokens/total': 8928000, 'tokens/trainable': 173554, 'epoch': '1.163'}
 58%|█████▊    | 279/478 [1:15:48<49:34, 14.95s/it] 59%|█████▊    | 280/478 [1:16:03<49:20, 14.95s/it]                                                   {'loss': '0.09204', 'grad_norm': '3.609', 'learning_rate': '8.8e-06', 'ppl': '1.096', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.34', 'tokens/total': 8960000, 'tokens/trainable': 174162, 'epoch': '1.167'}
 59%|█████▊    | 280/478 [1:16:03<49:20, 14.95s/it] 59%|█████▉    | 281/478 [1:16:18<49:05, 14.95s/it]                                                   {'loss': '0.09448', 'grad_norm': '3.656', 'learning_rate': '8.728e-06', 'ppl': '1.099', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.42', 'tokens/total': 8992000, 'tokens/trainable': 174802, 'epoch': '1.171'}
 59%|█████▉    | 281/478 [1:16:18<49:05, 14.95s/it] 59%|█████▉    | 282/478 [1:16:33<48:50, 14.95s/it]                                                   {'loss': '0.09424', 'grad_norm': '3.75', 'learning_rate': '8.656e-06', 'ppl': '1.099', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '24.35', 'tokens/total': 9024000, 'tokens/trainable': 175529, 'epoch': '1.175'}
 59%|█████▉    | 282/478 [1:16:33<48:50, 14.95s/it] 59%|█████▉    | 283/478 [1:16:48<48:34, 14.95s/it]                                                   {'loss': '0.1201', 'grad_norm': '4.125', 'learning_rate': '8.583e-06', 'ppl': '1.128', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.32', 'tokens/total': 9056000, 'tokens/trainable': 176195, 'epoch': '1.179'}
 59%|█████▉    | 283/478 [1:16:48<48:34, 14.95s/it] 59%|█████▉    | 284/478 [1:17:03<48:20, 14.95s/it]                                                   {'loss': '0.1104', 'grad_norm': '3.953', 'learning_rate': '8.511e-06', 'ppl': '1.117', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.7', 'tokens/total': 9088000, 'tokens/trainable': 176843, 'epoch': '1.183'}
 59%|█████▉    | 284/478 [1:17:03<48:20, 14.95s/it] 60%|█████▉    | 285/478 [1:17:18<48:05, 14.95s/it]                                                   {'loss': '0.1064', 'grad_norm': '4.281', 'learning_rate': '8.439e-06', 'ppl': '1.112', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.53', 'tokens/total': 9120000, 'tokens/trainable': 177516, 'epoch': '1.188'}
 60%|█████▉    | 285/478 [1:17:18<48:05, 14.95s/it] 60%|█████▉    | 286/478 [1:17:33<47:51, 14.95s/it]                                                   {'loss': '0.1108', 'grad_norm': '4.344', 'learning_rate': '8.367e-06', 'ppl': '1.117', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.88', 'tokens/total': 9152000, 'tokens/trainable': 178140, 'epoch': '1.192'}
 60%|█████▉    | 286/478 [1:17:33<47:51, 14.95s/it] 60%|██████    | 287/478 [1:17:48<47:36, 14.95s/it]                                                   {'loss': '0.09253', 'grad_norm': '3.922', 'learning_rate': '8.295e-06', 'ppl': '1.097', 'memory/max_active (GiB)': '45.17', 'memory/max_allocated (GiB)': '45.17', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.5', 'tokens/total': 9184000, 'tokens/trainable': 178812, 'epoch': '1.196'}
 60%|██████    | 287/478 [1:17:48<47:36, 14.95s/it] 60%|██████    | 288/478 [1:18:03<47:21, 14.96s/it]                                                   {'loss': '0.1304', 'grad_norm': '5.625', 'learning_rate': '8.224e-06', 'ppl': '1.139', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.64', 'tokens/total': 9216000, 'tokens/trainable': 179369, 'epoch': '1.2'}
 60%|██████    | 288/478 [1:18:03<47:21, 14.96s/it] 60%|██████    | 289/478 [1:18:18<47:06, 14.96s/it]                                                   {'loss': '0.1108', 'grad_norm': '4.625', 'learning_rate': '8.152e-06', 'ppl': '1.117', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.58', 'tokens/total': 9248000, 'tokens/trainable': 179984, 'epoch': '1.204'}
 60%|██████    | 289/478 [1:18:18<47:06, 14.96s/it] 61%|██████    | 290/478 [1:18:33<46:51, 14.96s/it]                                                   {'loss': '0.09741', 'grad_norm': '5.594', 'learning_rate': '8.08e-06', 'ppl': '1.102', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.95', 'tokens/total': 9280000, 'tokens/trainable': 180550, 'epoch': '1.208'}
 61%|██████    | 290/478 [1:18:33<46:51, 14.96s/it] 61%|██████    | 291/478 [1:18:48<46:36, 14.95s/it]                                                   {'loss': '0.08936', 'grad_norm': '4.219', 'learning_rate': '8.009e-06', 'ppl': '1.093', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.25', 'tokens/total': 9312000, 'tokens/trainable': 181095, 'epoch': '1.212'}
 61%|██████    | 291/478 [1:18:48<46:36, 14.95s/it] 61%|██████    | 292/478 [1:19:03<46:21, 14.95s/it]                                                   {'loss': '0.134', 'grad_norm': '5.094', 'learning_rate': '7.938e-06', 'ppl': '1.143', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.67', 'tokens/total': 9344000, 'tokens/trainable': 181680, 'epoch': '1.217'}
 61%|██████    | 292/478 [1:19:03<46:21, 14.95s/it] 61%|██████▏   | 293/478 [1:19:18<46:06, 14.95s/it]                                                   {'loss': '0.08301', 'grad_norm': '4.469', 'learning_rate': '7.866e-06', 'ppl': '1.087', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.52', 'tokens/total': 9376000, 'tokens/trainable': 182323, 'epoch': '1.221'}
 61%|██████▏   | 293/478 [1:19:18<46:06, 14.95s/it] 62%|██████▏   | 294/478 [1:19:33<45:50, 14.95s/it]                                                   {'loss': '0.1326', 'grad_norm': '4.75', 'learning_rate': '7.795e-06', 'ppl': '1.142', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.77', 'tokens/total': 9408000, 'tokens/trainable': 182972, 'epoch': '1.225'}
 62%|██████▏   | 294/478 [1:19:33<45:50, 14.95s/it] 62%|██████▏   | 295/478 [1:19:48<45:34, 14.94s/it]                                                   {'loss': '0.09399', 'grad_norm': '5.062', 'learning_rate': '7.724e-06', 'ppl': '1.099', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '23.34', 'tokens/total': 9440000, 'tokens/trainable': 183668, 'epoch': '1.229'}
 62%|██████▏   | 295/478 [1:19:48<45:34, 14.94s/it] 62%|██████▏   | 296/478 [1:20:03<45:19, 14.94s/it]                                                   {'loss': '0.05933', 'grad_norm': '4.625', 'learning_rate': '7.653e-06', 'ppl': '1.061', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.68', 'tokens/total': 9472000, 'tokens/trainable': 184226, 'epoch': '1.233'}
 62%|██████▏   | 296/478 [1:20:03<45:19, 14.94s/it] 62%|██████▏   | 297/478 [1:20:17<45:04, 14.94s/it]                                                   {'loss': '0.1252', 'grad_norm': '5.25', 'learning_rate': '7.582e-06', 'ppl': '1.133', 'memory/max_active (GiB)': '45.17', 'memory/max_allocated (GiB)': '45.17', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '23', 'tokens/total': 9504000, 'tokens/trainable': 184912, 'epoch': '1.238'}
 62%|██████▏   | 297/478 [1:20:17<45:04, 14.94s/it] 62%|██████▏   | 298/478 [1:20:32<44:49, 14.94s/it]                                                   {'loss': '0.1157', 'grad_norm': '5.531', 'learning_rate': '7.512e-06', 'ppl': '1.123', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.55', 'tokens/total': 9536000, 'tokens/trainable': 185466, 'epoch': '1.242'}
 62%|██████▏   | 298/478 [1:20:32<44:49, 14.94s/it] 63%|██████▎   | 299/478 [1:20:47<44:37, 14.96s/it]                                                   {'loss': '0.07788', 'grad_norm': '4.562', 'learning_rate': '7.441e-06', 'ppl': '1.081', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.14', 'tokens/total': 9568000, 'tokens/trainable': 186069, 'epoch': '1.246'}
 63%|██████▎   | 299/478 [1:20:47<44:37, 14.96s/it] 63%|██████▎   | 300/478 [1:21:02<44:22, 14.96s/it]                                                   {'loss': '0.1213', 'grad_norm': '5.469', 'learning_rate': '7.371e-06', 'ppl': '1.129', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.9', 'tokens/total': 9600000, 'tokens/trainable': 186753, 'epoch': '1.25'}
 63%|██████▎   | 300/478 [1:21:02<44:22, 14.96s/it] 63%|██████▎   | 301/478 [1:21:17<44:03, 14.94s/it]                                                   {'loss': '0.08154', 'grad_norm': '3.797', 'learning_rate': '7.301e-06', 'ppl': '1.085', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '24.01', 'tokens/total': 9632000, 'tokens/trainable': 187467, 'epoch': '1.254'}
 63%|██████▎   | 301/478 [1:21:17<44:03, 14.94s/it] 63%|██████▎   | 302/478 [1:21:32<43:50, 14.94s/it]                                                   {'loss': '0.0835', 'grad_norm': '4.406', 'learning_rate': '7.23e-06', 'ppl': '1.087', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.69', 'tokens/total': 9664000, 'tokens/trainable': 188085, 'epoch': '1.258'}
 63%|██████▎   | 302/478 [1:21:32<43:50, 14.94s/it] 63%|██████▎   | 303/478 [1:21:47<43:36, 14.95s/it]                                                   {'loss': '0.1338', 'grad_norm': '5.219', 'learning_rate': '7.16e-06', 'ppl': '1.143', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.31', 'tokens/total': 9696000, 'tokens/trainable': 188692, 'epoch': '1.262'}
 63%|██████▎   | 303/478 [1:21:47<43:36, 14.95s/it] 64%|██████▎   | 304/478 [1:22:02<43:21, 14.95s/it]                                                   {'loss': '0.1152', 'grad_norm': '4.344', 'learning_rate': '7.091e-06', 'ppl': '1.122', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.93', 'tokens/total': 9728000, 'tokens/trainable': 189377, 'epoch': '1.267'}
 64%|██████▎   | 304/478 [1:22:02<43:21, 14.95s/it] 64%|██████▍   | 305/478 [1:22:17<43:06, 14.95s/it]                                                   {'loss': '0.04266', 'grad_norm': '3.094', 'learning_rate': '7.021e-06', 'ppl': '1.044', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.04', 'tokens/total': 9760000, 'tokens/trainable': 189916, 'epoch': '1.271'}
 64%|██████▍   | 305/478 [1:22:17<43:06, 14.95s/it] 64%|██████▍   | 306/478 [1:22:32<42:52, 14.95s/it]                                                   {'loss': '0.09497', 'grad_norm': '4.875', 'learning_rate': '6.951e-06', 'ppl': '1.1', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.25', 'tokens/total': 9792000, 'tokens/trainable': 190551, 'epoch': '1.275'}
 64%|██████▍   | 306/478 [1:22:32<42:52, 14.95s/it] 64%|██████▍   | 307/478 [1:22:47<42:37, 14.95s/it]                                                   {'loss': '0.08691', 'grad_norm': '3.906', 'learning_rate': '6.882e-06', 'ppl': '1.091', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.1', 'tokens/total': 9824000, 'tokens/trainable': 191211, 'epoch': '1.279'}
 64%|██████▍   | 307/478 [1:22:47<42:37, 14.95s/it] 64%|██████▍   | 308/478 [1:23:02<42:22, 14.95s/it]                                                   {'loss': '0.1401', 'grad_norm': '5.781', 'learning_rate': '6.813e-06', 'ppl': '1.15', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '16.63', 'tokens/total': 9856000, 'tokens/trainable': 191708, 'epoch': '1.283'}
 64%|██████▍   | 308/478 [1:23:02<42:22, 14.95s/it] 65%|██████▍   | 309/478 [1:23:17<42:07, 14.96s/it]                                                   {'loss': '0.0658', 'grad_norm': '3.594', 'learning_rate': '6.744e-06', 'ppl': '1.068', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '17.97', 'tokens/total': 9888000, 'tokens/trainable': 192245, 'epoch': '1.288'}
 65%|██████▍   | 309/478 [1:23:17<42:07, 14.96s/it] 65%|██████▍   | 310/478 [1:23:32<41:52, 14.96s/it]                                                   {'loss': '0.07153', 'grad_norm': '3.516', 'learning_rate': '6.675e-06', 'ppl': '1.074', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.16', 'tokens/total': 9920000, 'tokens/trainable': 192907, 'epoch': '1.292'}
 65%|██████▍   | 310/478 [1:23:32<41:52, 14.96s/it] 65%|██████▌   | 311/478 [1:23:47<41:37, 14.96s/it]                                                   {'loss': '0.1274', 'grad_norm': '4.625', 'learning_rate': '6.606e-06', 'ppl': '1.136', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.87', 'tokens/total': 9952000, 'tokens/trainable': 193590, 'epoch': '1.296'}
 65%|██████▌   | 311/478 [1:23:47<41:37, 14.96s/it] 65%|██████▌   | 312/478 [1:24:02<41:23, 14.96s/it]                                                   {'loss': '0.0614', 'grad_norm': '3.797', 'learning_rate': '6.538e-06', 'ppl': '1.063', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.28', 'tokens/total': 9984000, 'tokens/trainable': 194196, 'epoch': '1.3'}
 65%|██████▌   | 312/478 [1:24:02<41:23, 14.96s/it] 65%|██████▌   | 313/478 [1:24:17<41:08, 14.96s/it]                                                   {'loss': '0.06641', 'grad_norm': '5.344', 'learning_rate': '6.47e-06', 'ppl': '1.069', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.52', 'tokens/total': 10016000, 'tokens/trainable': 194779, 'epoch': '1.304'}
 65%|██████▌   | 313/478 [1:24:17<41:08, 14.96s/it] 66%|██████▌   | 314/478 [1:24:32<40:52, 14.95s/it]                                                   {'loss': '0.09595', 'grad_norm': '5.312', 'learning_rate': '6.402e-06', 'ppl': '1.101', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.09', 'tokens/total': 10048000, 'tokens/trainable': 195408, 'epoch': '1.308'}
 66%|██████▌   | 314/478 [1:24:32<40:52, 14.95s/it] 66%|██████▌   | 315/478 [1:24:47<40:37, 14.95s/it]                                                   {'loss': '0.06055', 'grad_norm': '3.203', 'learning_rate': '6.334e-06', 'ppl': '1.062', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.68', 'tokens/total': 10080000, 'tokens/trainable': 195996, 'epoch': '1.312'}
 66%|██████▌   | 315/478 [1:24:47<40:37, 14.95s/it] 66%|██████▌   | 316/478 [1:25:02<40:22, 14.95s/it]                                                   {'loss': '0.08887', 'grad_norm': '3.938', 'learning_rate': '6.266e-06', 'ppl': '1.093', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.32', 'tokens/total': 10112000, 'tokens/trainable': 196603, 'epoch': '1.317'}
 66%|██████▌   | 316/478 [1:25:02<40:22, 14.95s/it] 66%|██████▋   | 317/478 [1:25:17<40:07, 14.95s/it]                                                   {'loss': '0.1387', 'grad_norm': '6.281', 'learning_rate': '6.198e-06', 'ppl': '1.149', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.02', 'tokens/total': 10144000, 'tokens/trainable': 197171, 'epoch': '1.321'}
 66%|██████▋   | 317/478 [1:25:17<40:07, 14.95s/it] 67%|██████▋   | 318/478 [1:25:32<39:52, 14.95s/it]                                                   {'loss': '0.07373', 'grad_norm': '2.984', 'learning_rate': '6.131e-06', 'ppl': '1.077', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22', 'tokens/total': 10176000, 'tokens/trainable': 197828, 'epoch': '1.325'}
 67%|██████▋   | 318/478 [1:25:32<39:52, 14.95s/it] 67%|██████▋   | 319/478 [1:25:46<39:37, 14.95s/it]                                                   {'loss': '0.0835', 'grad_norm': '4.781', 'learning_rate': '6.064e-06', 'ppl': '1.087', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.03', 'tokens/total': 10208000, 'tokens/trainable': 198456, 'epoch': '1.329'}
 67%|██████▋   | 319/478 [1:25:46<39:37, 14.95s/it] 67%|██████▋   | 320/478 [1:26:01<39:21, 14.94s/it]                                                   {'loss': '0.07983', 'grad_norm': '4.688', 'learning_rate': '5.997e-06', 'ppl': '1.083', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22', 'tokens/total': 10240000, 'tokens/trainable': 199112, 'epoch': '1.333'}
 67%|██████▋   | 320/478 [1:26:01<39:21, 14.94s/it] 67%|██████▋   | 321/478 [1:26:16<39:05, 14.94s/it]                                                   {'loss': '0.1011', 'grad_norm': '5.469', 'learning_rate': '5.93e-06', 'ppl': '1.106', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.63', 'tokens/total': 10272000, 'tokens/trainable': 199787, 'epoch': '1.337'}
 67%|██████▋   | 321/478 [1:26:16<39:05, 14.94s/it] 67%|██████▋   | 322/478 [1:26:31<38:51, 14.95s/it]                                                   {'loss': '0.0813', 'grad_norm': '4.906', 'learning_rate': '5.864e-06', 'ppl': '1.085', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.55', 'tokens/total': 10304000, 'tokens/trainable': 200401, 'epoch': '1.342'}
 67%|██████▋   | 322/478 [1:26:31<38:51, 14.95s/it] 68%|██████▊   | 323/478 [1:26:46<38:36, 14.95s/it]                                                   {'loss': '0.06006', 'grad_norm': '3.531', 'learning_rate': '5.798e-06', 'ppl': '1.062', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '16.24', 'tokens/total': 10336000, 'tokens/trainable': 200886, 'epoch': '1.346'}
 68%|██████▊   | 323/478 [1:26:46<38:36, 14.95s/it] 68%|██████▊   | 324/478 [1:27:01<38:22, 14.95s/it]                                                   {'loss': '0.08911', 'grad_norm': '4.469', 'learning_rate': '5.732e-06', 'ppl': '1.093', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.7', 'tokens/total': 10368000, 'tokens/trainable': 201564, 'epoch': '1.35'}
 68%|██████▊   | 324/478 [1:27:01<38:22, 14.95s/it] 68%|██████▊   | 325/478 [1:27:16<38:06, 14.95s/it]                                                   {'loss': '0.06714', 'grad_norm': '4.344', 'learning_rate': '5.666e-06', 'ppl': '1.069', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.53', 'tokens/total': 10400000, 'tokens/trainable': 202147, 'epoch': '1.354'}
 68%|██████▊   | 325/478 [1:27:16<38:06, 14.95s/it] 68%|██████▊   | 326/478 [1:27:31<37:52, 14.95s/it]                                                   {'loss': '0.1201', 'grad_norm': '5.5', 'learning_rate': '5.6e-06', 'ppl': '1.128', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '17.44', 'tokens/total': 10432000, 'tokens/trainable': 202668, 'epoch': '1.358'}
 68%|██████▊   | 326/478 [1:27:31<37:52, 14.95s/it] 68%|██████▊   | 327/478 [1:27:46<37:37, 14.95s/it]                                                   {'loss': '0.09717', 'grad_norm': '4', 'learning_rate': '5.535e-06', 'ppl': '1.102', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.83', 'tokens/total': 10464000, 'tokens/trainable': 203290, 'epoch': '1.363'}
 68%|██████▊   | 327/478 [1:27:46<37:37, 14.95s/it] 69%|██████▊   | 328/478 [1:28:01<37:22, 14.95s/it]                                                   {'loss': '0.08154', 'grad_norm': '4.406', 'learning_rate': '5.47e-06', 'ppl': '1.085', 'memory/max_active (GiB)': '45.17', 'memory/max_allocated (GiB)': '45.17', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '23.24', 'tokens/total': 10496000, 'tokens/trainable': 203984, 'epoch': '1.367'}
 69%|██████▊   | 328/478 [1:28:01<37:22, 14.95s/it] 69%|██████▉   | 329/478 [1:28:16<37:07, 14.95s/it]                                                   {'loss': '0.09863', 'grad_norm': '4.188', 'learning_rate': '5.405e-06', 'ppl': '1.104', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.06', 'tokens/total': 10528000, 'tokens/trainable': 204613, 'epoch': '1.371'}
 69%|██████▉   | 329/478 [1:28:16<37:07, 14.95s/it] 69%|██████▉   | 330/478 [1:28:31<36:52, 14.95s/it]                                                   {'loss': '0.1023', 'grad_norm': '4.531', 'learning_rate': '5.34e-06', 'ppl': '1.108', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '23.94', 'tokens/total': 10560000, 'tokens/trainable': 205328, 'epoch': '1.375'}
 69%|██████▉   | 330/478 [1:28:31<36:52, 14.95s/it] 69%|██████▉   | 331/478 [1:28:46<36:37, 14.95s/it]                                                   {'loss': '0.125', 'grad_norm': '5.469', 'learning_rate': '5.276e-06', 'ppl': '1.133', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.52', 'tokens/total': 10592000, 'tokens/trainable': 205941, 'epoch': '1.379'}
 69%|██████▉   | 331/478 [1:28:46<36:37, 14.95s/it] 69%|██████▉   | 332/478 [1:29:01<36:23, 14.95s/it]                                                   {'loss': '0.09619', 'grad_norm': '4.906', 'learning_rate': '5.212e-06', 'ppl': '1.101', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.62', 'tokens/total': 10624000, 'tokens/trainable': 206497, 'epoch': '1.383'}
 69%|██████▉   | 332/478 [1:29:01<36:23, 14.95s/it] 70%|██████▉   | 333/478 [1:29:16<36:08, 14.95s/it]                                                   {'loss': '0.1157', 'grad_norm': '4.875', 'learning_rate': '5.148e-06', 'ppl': '1.123', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22', 'tokens/total': 10656000, 'tokens/trainable': 207154, 'epoch': '1.387'}
 70%|██████▉   | 333/478 [1:29:16<36:08, 14.95s/it] 70%|██████▉   | 334/478 [1:29:31<35:52, 14.95s/it]                                                   {'loss': '0.07983', 'grad_norm': '5.969', 'learning_rate': '5.084e-06', 'ppl': '1.083', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '17.48', 'tokens/total': 10688000, 'tokens/trainable': 207676, 'epoch': '1.392'}
 70%|██████▉   | 334/478 [1:29:31<35:52, 14.95s/it] 70%|███████   | 335/478 [1:29:46<35:38, 14.95s/it]                                                   {'loss': '0.1125', 'grad_norm': '5.75', 'learning_rate': '5.021e-06', 'ppl': '1.119', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '16.1', 'tokens/total': 10720000, 'tokens/trainable': 208157, 'epoch': '1.396'}
 70%|███████   | 335/478 [1:29:46<35:38, 14.95s/it] 70%|███████   | 336/478 [1:30:01<35:23, 14.95s/it]                                                   {'loss': '0.08423', 'grad_norm': '5.094', 'learning_rate': '4.958e-06', 'ppl': '1.088', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '16.7', 'tokens/total': 10752000, 'tokens/trainable': 208656, 'epoch': '1.4'}
 70%|███████   | 336/478 [1:30:01<35:23, 14.95s/it] 71%|███████   | 337/478 [1:30:16<35:08, 14.95s/it]                                                   {'loss': '0.1116', 'grad_norm': '5.281', 'learning_rate': '4.895e-06', 'ppl': '1.118', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.7', 'tokens/total': 10784000, 'tokens/trainable': 209274, 'epoch': '1.404'}
 71%|███████   | 337/478 [1:30:16<35:08, 14.95s/it] 71%|███████   | 338/478 [1:30:31<34:53, 14.95s/it]                                                   {'loss': '0.07593', 'grad_norm': '4.062', 'learning_rate': '4.833e-06', 'ppl': '1.079', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.89', 'tokens/total': 10816000, 'tokens/trainable': 209898, 'epoch': '1.408'}
 71%|███████   | 338/478 [1:30:31<34:53, 14.95s/it] 71%|███████   | 339/478 [1:30:45<34:39, 14.96s/it]                                                   {'loss': '0.06348', 'grad_norm': '3.125', 'learning_rate': '4.77e-06', 'ppl': '1.066', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.86', 'tokens/total': 10848000, 'tokens/trainable': 210582, 'epoch': '1.413'}
 71%|███████   | 339/478 [1:30:46<34:39, 14.96s/it] 71%|███████   | 340/478 [1:31:00<34:24, 14.96s/it]                                                   {'loss': '0.09473', 'grad_norm': '4.094', 'learning_rate': '4.708e-06', 'ppl': '1.099', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.6', 'tokens/total': 10880000, 'tokens/trainable': 211197, 'epoch': '1.417'}
 71%|███████   | 340/478 [1:31:00<34:24, 14.96s/it] 71%|███████▏  | 341/478 [1:31:15<34:08, 14.96s/it]                                                   {'loss': '0.1062', 'grad_norm': '5.125', 'learning_rate': '4.647e-06', 'ppl': '1.112', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.61', 'tokens/total': 10912000, 'tokens/trainable': 211753, 'epoch': '1.421'}
 71%|███████▏  | 341/478 [1:31:15<34:08, 14.96s/it] 72%|███████▏  | 342/478 [1:31:30<33:54, 14.96s/it]                                                   {'loss': '0.08618', 'grad_norm': '4.469', 'learning_rate': '4.585e-06', 'ppl': '1.09', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.26', 'tokens/total': 10944000, 'tokens/trainable': 212388, 'epoch': '1.425'}
 72%|███████▏  | 342/478 [1:31:30<33:54, 14.96s/it] 72%|███████▏  | 343/478 [1:31:45<33:38, 14.96s/it]                                                   {'loss': '0.07764', 'grad_norm': '5.062', 'learning_rate': '4.524e-06', 'ppl': '1.081', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.42', 'tokens/total': 10976000, 'tokens/trainable': 212968, 'epoch': '1.429'}
 72%|███████▏  | 343/478 [1:31:45<33:38, 14.96s/it] 72%|███████▏  | 344/478 [1:32:00<33:24, 14.96s/it]                                                   {'loss': '0.1353', 'grad_norm': '5.312', 'learning_rate': '4.463e-06', 'ppl': '1.145', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.88', 'tokens/total': 11008000, 'tokens/trainable': 213592, 'epoch': '1.433'}
 72%|███████▏  | 344/478 [1:32:00<33:24, 14.96s/it] 72%|███████▏  | 345/478 [1:32:15<33:08, 14.95s/it]                                                   {'loss': '0.07617', 'grad_norm': '3.844', 'learning_rate': '4.403e-06', 'ppl': '1.079', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.47', 'tokens/total': 11040000, 'tokens/trainable': 214233, 'epoch': '1.438'}
 72%|███████▏  | 345/478 [1:32:15<33:08, 14.95s/it] 72%|███████▏  | 346/478 [1:32:30<32:53, 14.95s/it]                                                   {'loss': '0.08276', 'grad_norm': '4.844', 'learning_rate': '4.342e-06', 'ppl': '1.086', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '23.3', 'tokens/total': 11072000, 'tokens/trainable': 214929, 'epoch': '1.442'}
 72%|███████▏  | 346/478 [1:32:30<32:53, 14.95s/it] 73%|███████▎  | 347/478 [1:32:45<32:38, 14.95s/it]                                                   {'loss': '0.1309', 'grad_norm': '5.031', 'learning_rate': '4.282e-06', 'ppl': '1.14', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.93', 'tokens/total': 11104000, 'tokens/trainable': 215554, 'epoch': '1.446'}
 73%|███████▎  | 347/478 [1:32:45<32:38, 14.95s/it] 73%|███████▎  | 348/478 [1:33:00<32:23, 14.95s/it]                                                   {'loss': '0.0874', 'grad_norm': '4.156', 'learning_rate': '4.223e-06', 'ppl': '1.091', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.43', 'tokens/total': 11136000, 'tokens/trainable': 216164, 'epoch': '1.45'}
 73%|███████▎  | 348/478 [1:33:00<32:23, 14.95s/it] 73%|███████▎  | 349/478 [1:33:15<32:08, 14.95s/it]                                                   {'loss': '0.1606', 'grad_norm': '8', 'learning_rate': '4.164e-06', 'ppl': '1.174', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.89', 'tokens/total': 11168000, 'tokens/trainable': 216728, 'epoch': '1.454'}
 73%|███████▎  | 349/478 [1:33:15<32:08, 14.95s/it] 73%|███████▎  | 350/478 [1:33:30<31:53, 14.95s/it]                                                   {'loss': '0.1328', 'grad_norm': '5.344', 'learning_rate': '4.104e-06', 'ppl': '1.142', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.63', 'tokens/total': 11200000, 'tokens/trainable': 217374, 'epoch': '1.458'}
 73%|███████▎  | 350/478 [1:33:30<31:53, 14.95s/it] 73%|███████▎  | 351/478 [1:33:45<31:39, 14.95s/it]                                                   {'loss': '0.07727', 'grad_norm': '3.109', 'learning_rate': '4.046e-06', 'ppl': '1.08', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.99', 'tokens/total': 11232000, 'tokens/trainable': 218031, 'epoch': '1.462'}
 73%|███████▎  | 351/478 [1:33:45<31:39, 14.95s/it] 74%|███████▎  | 352/478 [1:34:00<31:24, 14.95s/it]                                                   {'loss': '0.1194', 'grad_norm': '5.219', 'learning_rate': '3.987e-06', 'ppl': '1.127', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.88', 'tokens/total': 11264000, 'tokens/trainable': 218595, 'epoch': '1.467'}
 74%|███████▎  | 352/478 [1:34:00<31:24, 14.95s/it] 74%|███████▍  | 353/478 [1:34:15<31:06, 14.94s/it]                                                   {'loss': '0.1445', 'grad_norm': '5.031', 'learning_rate': '3.929e-06', 'ppl': '1.155', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.72', 'tokens/total': 11296000, 'tokens/trainable': 219241, 'epoch': '1.471'}
 74%|███████▍  | 353/478 [1:34:15<31:06, 14.94s/it] 74%|███████▍  | 354/478 [1:34:30<30:52, 14.94s/it]                                                   {'loss': '0.0697', 'grad_norm': '4.438', 'learning_rate': '3.872e-06', 'ppl': '1.072', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.61', 'tokens/total': 11328000, 'tokens/trainable': 219916, 'epoch': '1.475'}
 74%|███████▍  | 354/478 [1:34:30<30:52, 14.94s/it] 74%|███████▍  | 355/478 [1:34:45<30:38, 14.94s/it]                                                   {'loss': '0.1052', 'grad_norm': '5.062', 'learning_rate': '3.814e-06', 'ppl': '1.111', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.15', 'tokens/total': 11360000, 'tokens/trainable': 220518, 'epoch': '1.479'}
 74%|███████▍  | 355/478 [1:34:45<30:38, 14.94s/it] 74%|███████▍  | 356/478 [1:35:00<30:23, 14.95s/it]                                                   {'loss': '0.08936', 'grad_norm': '4.531', 'learning_rate': '3.757e-06', 'ppl': '1.093', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.04', 'tokens/total': 11392000, 'tokens/trainable': 221057, 'epoch': '1.483'}
 74%|███████▍  | 356/478 [1:35:00<30:23, 14.95s/it] 75%|███████▍  | 357/478 [1:35:15<30:08, 14.95s/it]                                                   {'loss': '0.1221', 'grad_norm': '4.969', 'learning_rate': '3.7e-06', 'ppl': '1.13', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.69', 'tokens/total': 11424000, 'tokens/trainable': 221705, 'epoch': '1.488'}
 75%|███████▍  | 357/478 [1:35:15<30:08, 14.95s/it] 75%|███████▍  | 358/478 [1:35:30<29:54, 14.95s/it]                                                   {'loss': '0.09595', 'grad_norm': '4.25', 'learning_rate': '3.644e-06', 'ppl': '1.101', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.76', 'tokens/total': 11456000, 'tokens/trainable': 222355, 'epoch': '1.492'}
 75%|███████▍  | 358/478 [1:35:30<29:54, 14.95s/it] 75%|███████▌  | 359/478 [1:35:45<29:39, 14.95s/it]                                                   {'loss': '0.1006', 'grad_norm': '5.406', 'learning_rate': '3.588e-06', 'ppl': '1.106', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.99', 'tokens/total': 11488000, 'tokens/trainable': 223042, 'epoch': '1.496'}
 75%|███████▌  | 359/478 [1:35:45<29:39, 14.95s/it] 75%|███████▌  | 360/478 [1:35:59<29:24, 14.95s/it]                                                   {'loss': '0.08984', 'grad_norm': '4.344', 'learning_rate': '3.532e-06', 'ppl': '1.094', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.26', 'tokens/total': 11520000, 'tokens/trainable': 223707, 'epoch': '1.5'}
 75%|███████▌  | 360/478 [1:35:59<29:24, 14.95s/it][2026-04-17 03:45:24,126] [INFO] [axolotl.core.trainers.base] Running evaluation step...
[2026-04-17 03:45:31,509] [INFO] [axolotl.utils.samplers.multipack] gather_len_batches: [54, 54]

  0%|          | 0/27 [00:00<?, ?it/s]
  7%|▋         | 2/27 [00:02<00:35,  1.41s/it]
 11%|█         | 3/27 [00:05<00:47,  1.97s/it]
 15%|█▍        | 4/27 [00:08<00:51,  2.26s/it]
 19%|█▊        | 5/27 [00:11<00:53,  2.43s/it]
 22%|██▏       | 6/27 [00:13<00:53,  2.54s/it]
 26%|██▌       | 7/27 [00:16<00:52,  2.61s/it]
 30%|██▉       | 8/27 [00:19<00:50,  2.65s/it]
 33%|███▎      | 9/27 [00:22<00:48,  2.68s/it]
 37%|███▋      | 10/27 [00:24<00:45,  2.70s/it]
 41%|████      | 11/27 [00:27<00:43,  2.72s/it]
 44%|████▍     | 12/27 [00:30<00:39,  2.64s/it]
 48%|████▊     | 13/27 [00:33<00:38,  2.76s/it]
 52%|█████▏    | 14/27 [00:35<00:35,  2.76s/it]
 56%|█████▌    | 15/27 [00:38<00:33,  2.76s/it]
 59%|█████▉    | 16/27 [00:41<00:30,  2.76s/it]
 63%|██████▎   | 17/27 [00:44<00:27,  2.75s/it]
 67%|██████▋   | 18/27 [00:46<00:24,  2.75s/it]
 70%|███████   | 19/27 [00:49<00:22,  2.75s/it]
 74%|███████▍  | 20/27 [00:52<00:19,  2.75s/it]
 78%|███████▊  | 21/27 [00:54<00:15,  2.67s/it]
 81%|████████▏ | 22/27 [00:57<00:13,  2.78s/it]
 85%|████████▌ | 23/27 [01:00<00:11,  2.77s/it]
 89%|████████▉ | 24/27 [01:03<00:08,  2.77s/it]
 93%|█████████▎| 25/27 [01:06<00:05,  2.76s/it]
 96%|█████████▋| 26/27 [01:08<00:02,  2.76s/it]
100%|██████████| 27/27 [01:11<00:00,  2.79s/it]                                                   
                                               {'eval_loss': '0.2251', 'eval_runtime': '75.08', 'eval_samples_per_second': '2.784', 'eval_steps_per_second': '1.398', 'eval_ppl': '1.252', 'memory/max_active (GiB)': '34.91', 'memory/max_allocated (GiB)': '34.91', 'memory/device_reserved (GiB)': '57.54', 'epoch': '1.5', 'tokens/train_per_sec_per_gpu': '0'}
 75%|███████▌  | 360/478 [1:37:22<29:24, 14.95s/it]
100%|██████████| 27/27 [01:13<00:00,  2.79s/it]
                                                76%|███████▌  | 361/478 [1:37:37<1:17:25, 39.71s/it]                                                     {'loss': '0.09912', 'grad_norm': '4.531', 'learning_rate': '3.476e-06', 'ppl': '1.104', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '17.6', 'tokens/total': 11552000, 'tokens/trainable': 224233, 'epoch': '1.504'}
 76%|███████▌  | 361/478 [1:37:37<1:17:25, 39.71s/it] 76%|███████▌  | 362/478 [1:37:52<1:02:24, 32.28s/it]                                                     {'loss': '0.08459', 'grad_norm': '4.062', 'learning_rate': '3.421e-06', 'ppl': '1.088', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '19.69', 'tokens/total': 11584000, 'tokens/trainable': 224821, 'epoch': '1.508'}
 76%|███████▌  | 362/478 [1:37:52<1:02:24, 32.28s/it] 76%|███████▌  | 363/478 [1:38:07<51:55, 27.09s/it]                                                     {'loss': '0.1008', 'grad_norm': '4.812', 'learning_rate': '3.367e-06', 'ppl': '1.106', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '17.31', 'tokens/total': 11616000, 'tokens/trainable': 225338, 'epoch': '1.512'}
 76%|███████▌  | 363/478 [1:38:07<51:55, 27.09s/it] 76%|███████▌  | 364/478 [1:38:22<44:32, 23.45s/it]                                                   {'loss': '0.1174', 'grad_norm': '5.188', 'learning_rate': '3.312e-06', 'ppl': '1.125', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '17.41', 'tokens/total': 11648000, 'tokens/trainable': 225858, 'epoch': '1.517'}
 76%|███████▌  | 364/478 [1:38:22<44:32, 23.45s/it] 76%|███████▋  | 365/478 [1:38:37<39:21, 20.90s/it]                                                   {'loss': '0.1138', 'grad_norm': '5.594', 'learning_rate': '3.258e-06', 'ppl': '1.12', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20.13', 'tokens/total': 11680000, 'tokens/trainable': 226459, 'epoch': '1.521'}
 76%|███████▋  | 365/478 [1:38:37<39:21, 20.90s/it] 77%|███████▋  | 366/478 [1:38:52<35:40, 19.11s/it]                                                   {'loss': '0.1018', 'grad_norm': '4.125', 'learning_rate': '3.205e-06', 'ppl': '1.107', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.9', 'tokens/total': 11712000, 'tokens/trainable': 227113, 'epoch': '1.525'}
 77%|███████▋  | 366/478 [1:38:52<35:40, 19.11s/it] 77%|███████▋  | 367/478 [1:39:07<33:02, 17.86s/it]                                                   {'loss': '0.09009', 'grad_norm': '4.031', 'learning_rate': '3.151e-06', 'ppl': '1.094', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.33', 'tokens/total': 11744000, 'tokens/trainable': 227750, 'epoch': '1.529'}
 77%|███████▋  | 367/478 [1:39:07<33:02, 17.86s/it] 77%|███████▋  | 368/478 [1:39:22<31:09, 16.99s/it]                                                   {'loss': '0.09216', 'grad_norm': '4.25', 'learning_rate': '3.098e-06', 'ppl': '1.097', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '18.14', 'tokens/total': 11776000, 'tokens/trainable': 228292, 'epoch': '1.533'}
 77%|███████▋  | 368/478 [1:39:22<31:09, 16.99s/it] 77%|███████▋  | 369/478 [1:39:37<29:45, 16.38s/it]                                                   {'loss': '0.115', 'grad_norm': '5.594', 'learning_rate': '3.046e-06', 'ppl': '1.122', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '19.96', 'tokens/total': 11808000, 'tokens/trainable': 228888, 'epoch': '1.538'}
 77%|███████▋  | 369/478 [1:39:37<29:45, 16.38s/it] 77%|███████▋  | 370/478 [1:39:51<28:42, 15.95s/it]                                                   {'loss': '0.09424', 'grad_norm': '4.344', 'learning_rate': '2.994e-06', 'ppl': '1.099', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20', 'tokens/total': 11840000, 'tokens/trainable': 229485, 'epoch': '1.542'}
 77%|███████▋  | 370/478 [1:39:52<28:42, 15.95s/it] 78%|███████▊  | 371/478 [1:40:06<27:54, 15.65s/it]                                                   {'loss': '0.1067', 'grad_norm': '6.469', 'learning_rate': '2.942e-06', 'ppl': '1.113', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '17.93', 'tokens/total': 11872000, 'tokens/trainable': 230020, 'epoch': '1.546'}
 78%|███████▊  | 371/478 [1:40:06<27:54, 15.65s/it] 78%|███████▊  | 372/478 [1:40:21<27:16, 15.44s/it]                                                   {'loss': '0.1021', 'grad_norm': '5.469', 'learning_rate': '2.89e-06', 'ppl': '1.107', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '18.49', 'tokens/total': 11904000, 'tokens/trainable': 230572, 'epoch': '1.55'}
 78%|███████▊  | 372/478 [1:40:21<27:16, 15.44s/it] 78%|███████▊  | 373/478 [1:40:36<26:44, 15.28s/it]                                                   {'loss': '0.1387', 'grad_norm': '4.656', 'learning_rate': '2.839e-06', 'ppl': '1.149', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '23.25', 'tokens/total': 11936000, 'tokens/trainable': 231265, 'epoch': '1.554'}
 78%|███████▊  | 373/478 [1:40:36<26:44, 15.28s/it] 78%|███████▊  | 374/478 [1:40:51<26:20, 15.20s/it]                                                   {'loss': '0.137', 'grad_norm': '5.344', 'learning_rate': '2.789e-06', 'ppl': '1.147', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.97', 'tokens/total': 11968000, 'tokens/trainable': 231953, 'epoch': '1.558'}
 78%|███████▊  | 374/478 [1:40:51<26:20, 15.20s/it] 78%|███████▊  | 375/478 [1:41:06<25:57, 15.12s/it]                                                   {'loss': '0.09937', 'grad_norm': '3.984', 'learning_rate': '2.738e-06', 'ppl': '1.104', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.44', 'tokens/total': 12000000, 'tokens/trainable': 232623, 'epoch': '1.562'}
 78%|███████▊  | 375/478 [1:41:06<25:57, 15.12s/it] 79%|███████▊  | 376/478 [1:41:21<25:37, 15.07s/it]                                                   {'loss': '0.116', 'grad_norm': '4.875', 'learning_rate': '2.688e-06', 'ppl': '1.123', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20.19', 'tokens/total': 12032000, 'tokens/trainable': 233226, 'epoch': '1.567'}
 79%|███████▊  | 376/478 [1:41:21<25:37, 15.07s/it] 79%|███████▉  | 377/478 [1:41:36<25:17, 15.03s/it]                                                   {'loss': '0.1116', 'grad_norm': '5.531', 'learning_rate': '2.639e-06', 'ppl': '1.118', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20.79', 'tokens/total': 12064000, 'tokens/trainable': 233846, 'epoch': '1.571'}
 79%|███████▉  | 377/478 [1:41:36<25:17, 15.03s/it] 79%|███████▉  | 378/478 [1:41:51<25:00, 15.01s/it]                                                   {'loss': '0.1494', 'grad_norm': '6.625', 'learning_rate': '2.59e-06', 'ppl': '1.161', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '19.32', 'tokens/total': 12096000, 'tokens/trainable': 234423, 'epoch': '1.575'}
 79%|███████▉  | 378/478 [1:41:51<25:00, 15.01s/it] 79%|███████▉  | 379/478 [1:42:06<24:43, 14.99s/it]                                                   {'loss': '0.07104', 'grad_norm': '3.75', 'learning_rate': '2.541e-06', 'ppl': '1.074', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20.13', 'tokens/total': 12128000, 'tokens/trainable': 235024, 'epoch': '1.579'}
 79%|███████▉  | 379/478 [1:42:06<24:43, 14.99s/it] 79%|███████▉  | 380/478 [1:42:21<24:27, 14.98s/it]                                                   {'loss': '0.09961', 'grad_norm': '5.688', 'learning_rate': '2.493e-06', 'ppl': '1.105', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '16.68', 'tokens/total': 12160000, 'tokens/trainable': 235522, 'epoch': '1.583'}
 79%|███████▉  | 380/478 [1:42:21<24:27, 14.98s/it] 80%|███████▉  | 381/478 [1:42:36<24:10, 14.95s/it]                                                   {'loss': '0.1062', 'grad_norm': '4.531', 'learning_rate': '2.445e-06', 'ppl': '1.112', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.69', 'tokens/total': 12192000, 'tokens/trainable': 236167, 'epoch': '1.587'}
 80%|███████▉  | 381/478 [1:42:36<24:10, 14.95s/it] 80%|███████▉  | 382/478 [1:42:51<23:55, 14.95s/it]                                                   {'loss': '0.1431', 'grad_norm': '5.75', 'learning_rate': '2.397e-06', 'ppl': '1.154', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.84', 'tokens/total': 12224000, 'tokens/trainable': 236819, 'epoch': '1.592'}
 80%|███████▉  | 382/478 [1:42:51<23:55, 14.95s/it] 80%|████████  | 383/478 [1:43:06<23:40, 14.95s/it]                                                   {'loss': '0.1384', 'grad_norm': '5.062', 'learning_rate': '2.35e-06', 'ppl': '1.148', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.09', 'tokens/total': 12256000, 'tokens/trainable': 237449, 'epoch': '1.596'}
 80%|████████  | 383/478 [1:43:06<23:40, 14.95s/it] 80%|████████  | 384/478 [1:43:21<23:25, 14.95s/it]                                                   {'loss': '0.09216', 'grad_norm': '4.125', 'learning_rate': '2.303e-06', 'ppl': '1.097', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '18.75', 'tokens/total': 12288000, 'tokens/trainable': 238009, 'epoch': '1.6'}
 80%|████████  | 384/478 [1:43:21<23:25, 14.95s/it] 81%|████████  | 385/478 [1:43:36<23:10, 14.95s/it]                                                   {'loss': '0.09546', 'grad_norm': '4.75', 'learning_rate': '2.257e-06', 'ppl': '1.1', 'memory/max_active (GiB)': '45.17', 'memory/max_allocated (GiB)': '45.17', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '19.33', 'tokens/total': 12320000, 'tokens/trainable': 238586, 'epoch': '1.604'}
 81%|████████  | 385/478 [1:43:36<23:10, 14.95s/it] 81%|████████  | 386/478 [1:43:51<22:54, 14.94s/it]                                                   {'loss': '0.06152', 'grad_norm': '3.656', 'learning_rate': '2.211e-06', 'ppl': '1.063', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '18.85', 'tokens/total': 12352000, 'tokens/trainable': 239148, 'epoch': '1.608'}
 81%|████████  | 386/478 [1:43:51<22:54, 14.94s/it] 81%|████████  | 387/478 [1:44:06<22:39, 14.94s/it]                                                   {'loss': '0.09692', 'grad_norm': '4.656', 'learning_rate': '2.165e-06', 'ppl': '1.102', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20.66', 'tokens/total': 12384000, 'tokens/trainable': 239765, 'epoch': '1.613'}
 81%|████████  | 387/478 [1:44:06<22:39, 14.94s/it] 81%|████████  | 388/478 [1:44:20<22:25, 14.95s/it]                                                   {'loss': '0.1025', 'grad_norm': '4.562', 'learning_rate': '2.12e-06', 'ppl': '1.108', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.3', 'tokens/total': 12416000, 'tokens/trainable': 240431, 'epoch': '1.617'}
 81%|████████  | 388/478 [1:44:21<22:25, 14.95s/it] 81%|████████▏ | 389/478 [1:44:35<22:10, 14.95s/it]                                                   {'loss': '0.09546', 'grad_norm': '4.188', 'learning_rate': '2.076e-06', 'ppl': '1.1', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '19.79', 'tokens/total': 12448000, 'tokens/trainable': 241022, 'epoch': '1.621'}
 81%|████████▏ | 389/478 [1:44:35<22:10, 14.95s/it] 82%|████████▏ | 390/478 [1:44:50<21:55, 14.95s/it]                                                   {'loss': '0.07373', 'grad_norm': '3.828', 'learning_rate': '2.031e-06', 'ppl': '1.077', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '18.79', 'tokens/total': 12480000, 'tokens/trainable': 241583, 'epoch': '1.625'}
 82%|████████▏ | 390/478 [1:44:50<21:55, 14.95s/it] 82%|████████▏ | 391/478 [1:45:05<21:40, 14.95s/it]                                                   {'loss': '0.09595', 'grad_norm': '3.969', 'learning_rate': '1.988e-06', 'ppl': '1.101', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '18.89', 'tokens/total': 12512000, 'tokens/trainable': 242147, 'epoch': '1.629'}
 82%|████████▏ | 391/478 [1:45:05<21:40, 14.95s/it] 82%|████████▏ | 392/478 [1:45:20<21:25, 14.95s/it]                                                   {'loss': '0.1494', 'grad_norm': '5.188', 'learning_rate': '1.944e-06', 'ppl': '1.161', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.65', 'tokens/total': 12544000, 'tokens/trainable': 242823, 'epoch': '1.633'}
 82%|████████▏ | 392/478 [1:45:20<21:25, 14.95s/it] 82%|████████▏ | 393/478 [1:45:35<21:10, 14.95s/it]                                                   {'loss': '0.1069', 'grad_norm': '5.5', 'learning_rate': '1.901e-06', 'ppl': '1.113', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.85', 'tokens/total': 12576000, 'tokens/trainable': 243505, 'epoch': '1.637'}
 82%|████████▏ | 393/478 [1:45:35<21:10, 14.95s/it] 82%|████████▏ | 394/478 [1:45:50<20:55, 14.95s/it]                                                   {'loss': '0.08936', 'grad_norm': '5.531', 'learning_rate': '1.859e-06', 'ppl': '1.093', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20.29', 'tokens/total': 12608000, 'tokens/trainable': 244111, 'epoch': '1.642'}
 82%|████████▏ | 394/478 [1:45:50<20:55, 14.95s/it] 83%|████████▎ | 395/478 [1:46:05<20:40, 14.95s/it]                                                   {'loss': '0.1116', 'grad_norm': '4.312', 'learning_rate': '1.817e-06', 'ppl': '1.118', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '19.6', 'tokens/total': 12640000, 'tokens/trainable': 244696, 'epoch': '1.646'}
 83%|████████▎ | 395/478 [1:46:05<20:40, 14.95s/it] 83%|████████▎ | 396/478 [1:46:20<20:25, 14.94s/it]                                                   {'loss': '0.1196', 'grad_norm': '4.875', 'learning_rate': '1.775e-06', 'ppl': '1.127', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '23.28', 'tokens/total': 12672000, 'tokens/trainable': 245390, 'epoch': '1.65'}
 83%|████████▎ | 396/478 [1:46:20<20:25, 14.94s/it] 83%|████████▎ | 397/478 [1:46:35<20:10, 14.95s/it]                                                   {'loss': '0.1062', 'grad_norm': '3.922', 'learning_rate': '1.734e-06', 'ppl': '1.112', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.29', 'tokens/total': 12704000, 'tokens/trainable': 246056, 'epoch': '1.654'}
 83%|████████▎ | 397/478 [1:46:35<20:10, 14.95s/it] 83%|████████▎ | 398/478 [1:46:50<19:55, 14.95s/it]                                                   {'loss': '0.124', 'grad_norm': '4.469', 'learning_rate': '1.693e-06', 'ppl': '1.132', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.6', 'tokens/total': 12736000, 'tokens/trainable': 246701, 'epoch': '1.658'}
 83%|████████▎ | 398/478 [1:46:50<19:55, 14.95s/it] 83%|████████▎ | 399/478 [1:47:05<19:40, 14.95s/it]                                                   {'loss': '0.07373', 'grad_norm': '4.344', 'learning_rate': '1.653e-06', 'ppl': '1.077', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '18.28', 'tokens/total': 12768000, 'tokens/trainable': 247247, 'epoch': '1.663'}
 83%|████████▎ | 399/478 [1:47:05<19:40, 14.95s/it] 84%|████████▎ | 400/478 [1:47:20<19:25, 14.95s/it]                                                   {'loss': '0.08398', 'grad_norm': '3.781', 'learning_rate': '1.613e-06', 'ppl': '1.088', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '23.24', 'tokens/total': 12800000, 'tokens/trainable': 247941, 'epoch': '1.667'}
 84%|████████▎ | 400/478 [1:47:20<19:25, 14.95s/it] 84%|████████▍ | 401/478 [1:47:35<19:11, 14.95s/it]                                                   {'loss': '0.09668', 'grad_norm': '5.562', 'learning_rate': '1.573e-06', 'ppl': '1.102', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '17.41', 'tokens/total': 12832000, 'tokens/trainable': 248461, 'epoch': '1.671'}
 84%|████████▍ | 401/478 [1:47:35<19:11, 14.95s/it] 84%|████████▍ | 402/478 [1:47:50<18:56, 14.95s/it]                                                   {'loss': '0.1016', 'grad_norm': '4.344', 'learning_rate': '1.534e-06', 'ppl': '1.107', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.1', 'tokens/total': 12864000, 'tokens/trainable': 249091, 'epoch': '1.675'}
 84%|████████▍ | 402/478 [1:47:50<18:56, 14.95s/it] 84%|████████▍ | 403/478 [1:48:05<18:41, 14.95s/it]                                                   {'loss': '0.127', 'grad_norm': '5.969', 'learning_rate': '1.496e-06', 'ppl': '1.135', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.63', 'tokens/total': 12896000, 'tokens/trainable': 249737, 'epoch': '1.679'}
 84%|████████▍ | 403/478 [1:48:05<18:41, 14.95s/it] 85%|████████▍ | 404/478 [1:48:20<18:26, 14.95s/it]                                                   {'loss': '0.09387', 'grad_norm': '4.5', 'learning_rate': '1.457e-06', 'ppl': '1.098', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '16.68', 'tokens/total': 12928000, 'tokens/trainable': 250235, 'epoch': '1.683'}
 85%|████████▍ | 404/478 [1:48:20<18:26, 14.95s/it] 85%|████████▍ | 405/478 [1:48:35<18:11, 14.95s/it]                                                   {'loss': '0.1504', 'grad_norm': '6.469', 'learning_rate': '1.42e-06', 'ppl': '1.162', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.6', 'tokens/total': 12960000, 'tokens/trainable': 250910, 'epoch': '1.688'}
 85%|████████▍ | 405/478 [1:48:35<18:11, 14.95s/it] 85%|████████▍ | 406/478 [1:48:50<17:56, 14.95s/it]                                                   {'loss': '0.08569', 'grad_norm': '5.438', 'learning_rate': '1.383e-06', 'ppl': '1.089', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '18.58', 'tokens/total': 12992000, 'tokens/trainable': 251465, 'epoch': '1.692'}
 85%|████████▍ | 406/478 [1:48:50<17:56, 14.95s/it] 85%|████████▌ | 407/478 [1:49:04<17:41, 14.94s/it]                                                   {'loss': '0.1079', 'grad_norm': '4.562', 'learning_rate': '1.346e-06', 'ppl': '1.114', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.03', 'tokens/total': 13024000, 'tokens/trainable': 252092, 'epoch': '1.696'}
 85%|████████▌ | 407/478 [1:49:05<17:41, 14.94s/it] 85%|████████▌ | 408/478 [1:49:19<17:26, 14.94s/it]                                                   {'loss': '0.105', 'grad_norm': '4.719', 'learning_rate': '1.31e-06', 'ppl': '1.111', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '19.23', 'tokens/total': 13056000, 'tokens/trainable': 252666, 'epoch': '1.7'}
 85%|████████▌ | 408/478 [1:49:19<17:26, 14.94s/it] 86%|████████▌ | 409/478 [1:49:34<17:11, 14.95s/it]                                                   {'loss': '0.1016', 'grad_norm': '5.094', 'learning_rate': '1.274e-06', 'ppl': '1.107', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.03', 'tokens/total': 13088000, 'tokens/trainable': 253294, 'epoch': '1.704'}
 86%|████████▌ | 409/478 [1:49:34<17:11, 14.95s/it] 86%|████████▌ | 410/478 [1:49:49<16:56, 14.95s/it]                                                   {'loss': '0.1023', 'grad_norm': '4.938', 'learning_rate': '1.238e-06', 'ppl': '1.108', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20.95', 'tokens/total': 13120000, 'tokens/trainable': 253920, 'epoch': '1.708'}
 86%|████████▌ | 410/478 [1:49:49<16:56, 14.95s/it] 86%|████████▌ | 411/478 [1:50:04<16:42, 14.96s/it]                                                   {'loss': '0.1167', 'grad_norm': '5.812', 'learning_rate': '1.203e-06', 'ppl': '1.124', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20.77', 'tokens/total': 13152000, 'tokens/trainable': 254541, 'epoch': '1.712'}
 86%|████████▌ | 411/478 [1:50:04<16:42, 14.96s/it] 86%|████████▌ | 412/478 [1:50:19<16:26, 14.95s/it]                                                   {'loss': '0.1777', 'grad_norm': '5.562', 'learning_rate': '1.169e-06', 'ppl': '1.195', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.07', 'tokens/total': 13184000, 'tokens/trainable': 255200, 'epoch': '1.717'}
 86%|████████▌ | 412/478 [1:50:19<16:26, 14.95s/it] 86%|████████▋ | 413/478 [1:50:34<16:12, 14.95s/it]                                                   {'loss': '0.1094', 'grad_norm': '4.25', 'learning_rate': '1.135e-06', 'ppl': '1.116', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20.79', 'tokens/total': 13216000, 'tokens/trainable': 255821, 'epoch': '1.721'}
 86%|████████▋ | 413/478 [1:50:34<16:12, 14.95s/it] 87%|████████▋ | 414/478 [1:50:49<15:57, 14.96s/it]                                                   {'loss': '0.1216', 'grad_norm': '5', 'learning_rate': '1.102e-06', 'ppl': '1.129', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '24.83', 'tokens/total': 13248000, 'tokens/trainable': 256563, 'epoch': '1.725'}
 87%|████████▋ | 414/478 [1:50:49<15:57, 14.96s/it] 87%|████████▋ | 415/478 [1:51:04<15:42, 14.95s/it]                                                   {'loss': '0.1398', 'grad_norm': '5', 'learning_rate': '1.069e-06', 'ppl': '1.15', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21', 'tokens/total': 13280000, 'tokens/trainable': 257190, 'epoch': '1.729'}
 87%|████████▋ | 415/478 [1:51:04<15:42, 14.95s/it] 87%|████████▋ | 416/478 [1:51:19<15:26, 14.95s/it]                                                   {'loss': '0.1499', 'grad_norm': '5.219', 'learning_rate': '1.036e-06', 'ppl': '1.162', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '24.32', 'tokens/total': 13312000, 'tokens/trainable': 257915, 'epoch': '1.733'}
 87%|████████▋ | 416/478 [1:51:19<15:26, 14.95s/it] 87%|████████▋ | 417/478 [1:51:34<15:11, 14.95s/it]                                                   {'loss': '0.1335', 'grad_norm': '5.312', 'learning_rate': '1.004e-06', 'ppl': '1.143', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20.39', 'tokens/total': 13344000, 'tokens/trainable': 258524, 'epoch': '1.738'}
 87%|████████▋ | 417/478 [1:51:34<15:11, 14.95s/it] 87%|████████▋ | 418/478 [1:51:49<14:56, 14.95s/it]                                                   {'loss': '0.1001', 'grad_norm': '3.969', 'learning_rate': '9.723e-07', 'ppl': '1.105', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.87', 'tokens/total': 13376000, 'tokens/trainable': 259207, 'epoch': '1.742'}
 87%|████████▋ | 418/478 [1:51:49<14:56, 14.95s/it] 88%|████████▊ | 419/478 [1:52:04<14:41, 14.95s/it]                                                   {'loss': '0.09399', 'grad_norm': '4.562', 'learning_rate': '9.412e-07', 'ppl': '1.099', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '17.58', 'tokens/total': 13408000, 'tokens/trainable': 259732, 'epoch': '1.746'}
 88%|████████▊ | 419/478 [1:52:04<14:41, 14.95s/it] 88%|████████▊ | 420/478 [1:52:19<14:26, 14.95s/it]                                                   {'loss': '0.08618', 'grad_norm': '4.438', 'learning_rate': '9.106e-07', 'ppl': '1.09', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '19.46', 'tokens/total': 13440000, 'tokens/trainable': 260313, 'epoch': '1.75'}
 88%|████████▊ | 420/478 [1:52:19<14:26, 14.95s/it] 88%|████████▊ | 421/478 [1:52:34<14:12, 14.95s/it]                                                   {'loss': '0.1147', 'grad_norm': '4.719', 'learning_rate': '8.804e-07', 'ppl': '1.122', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20.85', 'tokens/total': 13472000, 'tokens/trainable': 260936, 'epoch': '1.754'}
 88%|████████▊ | 421/478 [1:52:34<14:12, 14.95s/it] 88%|████████▊ | 422/478 [1:52:49<13:57, 14.95s/it]                                                   {'loss': '0.08655', 'grad_norm': '4.156', 'learning_rate': '8.508e-07', 'ppl': '1.09', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '18.58', 'tokens/total': 13504000, 'tokens/trainable': 261491, 'epoch': '1.758'}
 88%|████████▊ | 422/478 [1:52:49<13:57, 14.95s/it] 88%|████████▊ | 423/478 [1:53:04<13:42, 14.95s/it]                                                   {'loss': '0.07715', 'grad_norm': '3.297', 'learning_rate': '8.216e-07', 'ppl': '1.08', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '19.03', 'tokens/total': 13536000, 'tokens/trainable': 262059, 'epoch': '1.762'}
 88%|████████▊ | 423/478 [1:53:04<13:42, 14.95s/it] 89%|████████▊ | 424/478 [1:53:19<13:27, 14.95s/it]                                                   {'loss': '0.1002', 'grad_norm': '4.531', 'learning_rate': '7.929e-07', 'ppl': '1.105', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.1', 'tokens/total': 13568000, 'tokens/trainable': 262689, 'epoch': '1.767'}
 89%|████████▊ | 424/478 [1:53:19<13:27, 14.95s/it] 89%|████████▉ | 425/478 [1:53:34<13:12, 14.95s/it]                                                   {'loss': '0.08423', 'grad_norm': '4.406', 'learning_rate': '7.647e-07', 'ppl': '1.088', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '17.76', 'tokens/total': 13600000, 'tokens/trainable': 263219, 'epoch': '1.771'}
 89%|████████▉ | 425/478 [1:53:34<13:12, 14.95s/it] 89%|████████▉ | 426/478 [1:53:49<12:57, 14.95s/it]                                                   {'loss': '0.1345', 'grad_norm': '6.062', 'learning_rate': '7.37e-07', 'ppl': '1.144', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.27', 'tokens/total': 13632000, 'tokens/trainable': 263884, 'epoch': '1.775'}
 89%|████████▉ | 426/478 [1:53:49<12:57, 14.95s/it] 89%|████████▉ | 427/478 [1:54:04<12:42, 14.95s/it]                                                   {'loss': '0.1025', 'grad_norm': '4.938', 'learning_rate': '7.098e-07', 'ppl': '1.108', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20.23', 'tokens/total': 13664000, 'tokens/trainable': 264488, 'epoch': '1.779'}
 89%|████████▉ | 427/478 [1:54:04<12:42, 14.95s/it] 90%|████████▉ | 428/478 [1:54:18<12:27, 14.95s/it]                                                   {'loss': '0.07617', 'grad_norm': '3.781', 'learning_rate': '6.83e-07', 'ppl': '1.079', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.44', 'tokens/total': 13696000, 'tokens/trainable': 265128, 'epoch': '1.783'}
 90%|████████▉ | 428/478 [1:54:18<12:27, 14.95s/it] 90%|████████▉ | 429/478 [1:54:33<12:12, 14.95s/it]                                                   {'loss': '0.1064', 'grad_norm': '4.469', 'learning_rate': '6.568e-07', 'ppl': '1.112', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21', 'tokens/total': 13728000, 'tokens/trainable': 265755, 'epoch': '1.788'}
 90%|████████▉ | 429/478 [1:54:33<12:12, 14.95s/it] 90%|████████▉ | 430/478 [1:54:48<11:57, 14.95s/it]                                                   {'loss': '0.09375', 'grad_norm': '4.375', 'learning_rate': '6.311e-07', 'ppl': '1.098', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.77', 'tokens/total': 13760000, 'tokens/trainable': 266405, 'epoch': '1.792'}
 90%|████████▉ | 430/478 [1:54:48<11:57, 14.95s/it] 90%|█████████ | 431/478 [1:55:03<11:42, 14.95s/it]                                                   {'loss': '0.1077', 'grad_norm': '4.219', 'learning_rate': '6.058e-07', 'ppl': '1.114', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '23.04', 'tokens/total': 13792000, 'tokens/trainable': 267093, 'epoch': '1.796'}
 90%|█████████ | 431/478 [1:55:03<11:42, 14.95s/it] 90%|█████████ | 432/478 [1:55:18<11:27, 14.95s/it]                                                   {'loss': '0.1057', 'grad_norm': '4.594', 'learning_rate': '5.811e-07', 'ppl': '1.111', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.54', 'tokens/total': 13824000, 'tokens/trainable': 267736, 'epoch': '1.8'}
 90%|█████████ | 432/478 [1:55:18<11:27, 14.95s/it] 91%|█████████ | 433/478 [1:55:33<11:12, 14.95s/it]                                                   {'loss': '0.1113', 'grad_norm': '5.094', 'learning_rate': '5.569e-07', 'ppl': '1.118', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.17', 'tokens/total': 13856000, 'tokens/trainable': 268398, 'epoch': '1.804'}
 91%|█████████ | 433/478 [1:55:33<11:12, 14.95s/it] 91%|█████████ | 434/478 [1:55:48<10:56, 14.93s/it]                                                   {'loss': '0.1201', 'grad_norm': '4.969', 'learning_rate': '5.331e-07', 'ppl': '1.128', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.31', 'tokens/total': 13888000, 'tokens/trainable': 269061, 'epoch': '1.808'}
 91%|█████████ | 434/478 [1:55:48<10:56, 14.93s/it] 91%|█████████ | 435/478 [1:56:03<10:42, 14.93s/it]                                                   {'loss': '0.104', 'grad_norm': '4.406', 'learning_rate': '5.099e-07', 'ppl': '1.11', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.69', 'tokens/total': 13920000, 'tokens/trainable': 269738, 'epoch': '1.812'}
 91%|█████████ | 435/478 [1:56:03<10:42, 14.93s/it] 91%|█████████ | 436/478 [1:56:18<10:27, 14.93s/it]                                                   {'loss': '0.1233', 'grad_norm': '4.281', 'learning_rate': '4.872e-07', 'ppl': '1.131', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.51', 'tokens/total': 13952000, 'tokens/trainable': 270409, 'epoch': '1.817'}
 91%|█████████ | 436/478 [1:56:18<10:27, 14.93s/it] 91%|█████████▏| 437/478 [1:56:33<10:12, 14.93s/it]                                                   {'loss': '0.1187', 'grad_norm': '5.219', 'learning_rate': '4.65e-07', 'ppl': '1.126', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '19.09', 'tokens/total': 13984000, 'tokens/trainable': 270979, 'epoch': '1.821'}
 91%|█████████▏| 437/478 [1:56:33<10:12, 14.93s/it] 92%|█████████▏| 438/478 [1:56:48<09:57, 14.94s/it]                                                   {'loss': '0.1045', 'grad_norm': '5', 'learning_rate': '4.432e-07', 'ppl': '1.11', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.7', 'tokens/total': 14016000, 'tokens/trainable': 271627, 'epoch': '1.825'}
 92%|█████████▏| 438/478 [1:56:48<09:57, 14.94s/it] 92%|█████████▏| 439/478 [1:57:03<09:44, 15.00s/it]                                                   {'loss': '0.1008', 'grad_norm': '5', 'learning_rate': '4.22e-07', 'ppl': '1.106', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '19.12', 'tokens/total': 14048000, 'tokens/trainable': 272205, 'epoch': '1.829'}
 92%|█████████▏| 439/478 [1:57:03<09:44, 15.00s/it] 92%|█████████▏| 440/478 [1:57:18<09:29, 14.98s/it]                                                   {'loss': '0.1465', 'grad_norm': '5.156', 'learning_rate': '4.013e-07', 'ppl': '1.158', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20.87', 'tokens/total': 14080000, 'tokens/trainable': 272828, 'epoch': '1.833'}
 92%|█████████▏| 440/478 [1:57:18<09:29, 14.98s/it] 92%|█████████▏| 441/478 [1:57:33<09:13, 14.97s/it]                                                   {'loss': '0.1189', 'grad_norm': '4.219', 'learning_rate': '3.812e-07', 'ppl': '1.126', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20.87', 'tokens/total': 14112000, 'tokens/trainable': 273451, 'epoch': '1.837'}
 92%|█████████▏| 441/478 [1:57:33<09:13, 14.97s/it] 92%|█████████▏| 442/478 [1:57:48<08:58, 14.97s/it]                                                   {'loss': '0.09351', 'grad_norm': '4.188', 'learning_rate': '3.615e-07', 'ppl': '1.098', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '18.96', 'tokens/total': 14144000, 'tokens/trainable': 274017, 'epoch': '1.842'}
 92%|█████████▏| 442/478 [1:57:48<08:58, 14.97s/it] 93%|█████████▎| 443/478 [1:58:03<08:43, 14.96s/it]                                                   {'loss': '0.1255', 'grad_norm': '4.656', 'learning_rate': '3.423e-07', 'ppl': '1.134', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.1', 'tokens/total': 14176000, 'tokens/trainable': 274647, 'epoch': '1.846'}
 93%|█████████▎| 443/478 [1:58:03<08:43, 14.96s/it] 93%|█████████▎| 444/478 [1:58:18<08:28, 14.96s/it]                                                   {'loss': '0.1572', 'grad_norm': '5.25', 'learning_rate': '3.237e-07', 'ppl': '1.17', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '23.21', 'tokens/total': 14208000, 'tokens/trainable': 275340, 'epoch': '1.85'}
 93%|█████████▎| 444/478 [1:58:18<08:28, 14.96s/it] 93%|█████████▎| 445/478 [1:58:33<08:13, 14.95s/it]                                                   {'loss': '0.0918', 'grad_norm': '3.75', 'learning_rate': '3.055e-07', 'ppl': '1.096', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.47', 'tokens/total': 14240000, 'tokens/trainable': 275980, 'epoch': '1.854'}
 93%|█████████▎| 445/478 [1:58:33<08:13, 14.95s/it] 93%|█████████▎| 446/478 [1:58:48<07:58, 14.95s/it]                                                   {'loss': '0.07227', 'grad_norm': '3.5', 'learning_rate': '2.879e-07', 'ppl': '1.075', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.79', 'tokens/total': 14272000, 'tokens/trainable': 276631, 'epoch': '1.858'}
 93%|█████████▎| 446/478 [1:58:48<07:58, 14.95s/it] 94%|█████████▎| 447/478 [1:59:03<07:43, 14.95s/it]                                                   {'loss': '0.137', 'grad_norm': '4.969', 'learning_rate': '2.708e-07', 'ppl': '1.147', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '23.74', 'tokens/total': 14304000, 'tokens/trainable': 277340, 'epoch': '1.863'}
 94%|█████████▎| 447/478 [1:59:03<07:43, 14.95s/it] 94%|█████████▎| 448/478 [1:59:17<07:28, 14.95s/it]                                                   {'loss': '0.1152', 'grad_norm': '4.75', 'learning_rate': '2.542e-07', 'ppl': '1.122', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '19.96', 'tokens/total': 14336000, 'tokens/trainable': 277936, 'epoch': '1.867'}
 94%|█████████▎| 448/478 [1:59:17<07:28, 14.95s/it] 94%|█████████▍| 449/478 [1:59:32<07:12, 14.93s/it]                                                   {'loss': '0.167', 'grad_norm': '5.812', 'learning_rate': '2.381e-07', 'ppl': '1.182', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '23.15', 'tokens/total': 14368000, 'tokens/trainable': 278624, 'epoch': '1.871'}
 94%|█████████▍| 449/478 [1:59:32<07:12, 14.93s/it] 94%|█████████▍| 450/478 [1:59:47<06:58, 14.93s/it]                                                   {'loss': '0.09229', 'grad_norm': '4.312', 'learning_rate': '2.226e-07', 'ppl': '1.097', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.07', 'tokens/total': 14400000, 'tokens/trainable': 279253, 'epoch': '1.875'}
 94%|█████████▍| 450/478 [1:59:47<06:58, 14.93s/it] 94%|█████████▍| 451/478 [2:00:02<06:43, 14.93s/it]                                                   {'loss': '0.0874', 'grad_norm': '3.5', 'learning_rate': '2.076e-07', 'ppl': '1.091', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.39', 'tokens/total': 14432000, 'tokens/trainable': 279921, 'epoch': '1.879'}
 94%|█████████▍| 451/478 [2:00:02<06:43, 14.93s/it] 95%|█████████▍| 452/478 [2:00:17<06:27, 14.92s/it]                                                   {'loss': '0.1311', 'grad_norm': '4.469', 'learning_rate': '1.93e-07', 'ppl': '1.14', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.63', 'tokens/total': 14464000, 'tokens/trainable': 280564, 'epoch': '1.883'}
 95%|█████████▍| 452/478 [2:00:17<06:27, 14.92s/it] 95%|█████████▍| 453/478 [2:00:32<06:13, 14.93s/it]                                                   {'loss': '0.1213', 'grad_norm': '4.562', 'learning_rate': '1.79e-07', 'ppl': '1.129', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.21', 'tokens/total': 14496000, 'tokens/trainable': 281227, 'epoch': '1.887'}
 95%|█████████▍| 453/478 [2:00:32<06:13, 14.93s/it] 95%|█████████▍| 454/478 [2:00:47<05:58, 14.94s/it]                                                   {'loss': '0.1208', 'grad_norm': '5.375', 'learning_rate': '1.656e-07', 'ppl': '1.128', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '16.85', 'tokens/total': 14528000, 'tokens/trainable': 281731, 'epoch': '1.892'}
 95%|█████████▍| 454/478 [2:00:47<05:58, 14.94s/it] 95%|█████████▌| 455/478 [2:01:02<05:43, 14.94s/it]                                                   {'loss': '0.1101', 'grad_norm': '4.938', 'learning_rate': '1.526e-07', 'ppl': '1.116', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20.3', 'tokens/total': 14560000, 'tokens/trainable': 282337, 'epoch': '1.896'}
 95%|█████████▌| 455/478 [2:01:02<05:43, 14.94s/it] 95%|█████████▌| 456/478 [2:01:17<05:28, 14.94s/it]                                                   {'loss': '0.1323', 'grad_norm': '4.969', 'learning_rate': '1.402e-07', 'ppl': '1.141', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.47', 'tokens/total': 14592000, 'tokens/trainable': 282978, 'epoch': '1.9'}
 95%|█████████▌| 456/478 [2:01:17<05:28, 14.94s/it] 96%|█████████▌| 457/478 [2:01:32<05:13, 14.94s/it]                                                   {'loss': '0.1028', 'grad_norm': '4.594', 'learning_rate': '1.283e-07', 'ppl': '1.108', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.3', 'tokens/total': 14624000, 'tokens/trainable': 283614, 'epoch': '1.904'}
 96%|█████████▌| 457/478 [2:01:32<05:13, 14.94s/it] 96%|█████████▌| 458/478 [2:01:47<04:58, 14.94s/it]                                                   {'loss': '0.1025', 'grad_norm': '4.156', 'learning_rate': '1.169e-07', 'ppl': '1.108', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.45', 'tokens/total': 14656000, 'tokens/trainable': 284283, 'epoch': '1.908'}
 96%|█████████▌| 458/478 [2:01:47<04:58, 14.94s/it] 96%|█████████▌| 459/478 [2:02:02<04:43, 14.94s/it]                                                   {'loss': '0.0918', 'grad_norm': '4.281', 'learning_rate': '1.061e-07', 'ppl': '1.096', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.04', 'tokens/total': 14688000, 'tokens/trainable': 284941, 'epoch': '1.913'}
 96%|█████████▌| 459/478 [2:02:02<04:43, 14.94s/it] 96%|█████████▌| 460/478 [2:02:17<04:28, 14.94s/it]                                                   {'loss': '0.08789', 'grad_norm': '4.656', 'learning_rate': '9.575e-08', 'ppl': '1.092', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.68', 'tokens/total': 14720000, 'tokens/trainable': 285618, 'epoch': '1.917'}
 96%|█████████▌| 460/478 [2:02:17<04:28, 14.94s/it] 96%|█████████▋| 461/478 [2:02:32<04:13, 14.93s/it]                                                   {'loss': '0.1421', 'grad_norm': '5.469', 'learning_rate': '8.595e-08', 'ppl': '1.153', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '23.9', 'tokens/total': 14752000, 'tokens/trainable': 286330, 'epoch': '1.921'}
 96%|█████████▋| 461/478 [2:02:32<04:13, 14.93s/it] 97%|█████████▋| 462/478 [2:02:47<03:58, 14.94s/it]                                                   {'loss': '0.1292', 'grad_norm': '4.75', 'learning_rate': '7.668e-08', 'ppl': '1.138', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20.47', 'tokens/total': 14784000, 'tokens/trainable': 286941, 'epoch': '1.925'}
 97%|█████████▋| 462/478 [2:02:47<03:58, 14.94s/it] 97%|█████████▋| 463/478 [2:03:02<03:46, 15.13s/it]                                                   {'loss': '0.1533', 'grad_norm': '5.406', 'learning_rate': '6.793e-08', 'ppl': '1.166', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.2', 'tokens/total': 14816000, 'tokens/trainable': 287574, 'epoch': '1.929'}
 97%|█████████▋| 463/478 [2:03:02<03:46, 15.13s/it] 97%|█████████▋| 464/478 [2:03:17<03:31, 15.08s/it]                                                   {'loss': '0.09521', 'grad_norm': '3.891', 'learning_rate': '5.971e-08', 'ppl': '1.1', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20.9', 'tokens/total': 14848000, 'tokens/trainable': 288198, 'epoch': '1.933'}
 97%|█████████▋| 464/478 [2:03:17<03:31, 15.08s/it] 97%|█████████▋| 465/478 [2:03:32<03:15, 15.04s/it]                                                   {'loss': '0.1111', 'grad_norm': '4.344', 'learning_rate': '5.202e-08', 'ppl': '1.117', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.87', 'tokens/total': 14880000, 'tokens/trainable': 288851, 'epoch': '1.938'}
 97%|█████████▋| 465/478 [2:03:32<03:15, 15.04s/it] 97%|█████████▋| 466/478 [2:03:47<03:00, 15.07s/it]                                                   {'loss': '0.1223', 'grad_norm': '5.156', 'learning_rate': '4.486e-08', 'ppl': '1.13', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '19.42', 'tokens/total': 14912000, 'tokens/trainable': 289439, 'epoch': '1.942'}
 97%|█████████▋| 466/478 [2:03:47<03:00, 15.07s/it] 98%|█████████▊| 467/478 [2:04:02<02:45, 15.04s/it]                                                   {'loss': '0.1265', 'grad_norm': '4.75', 'learning_rate': '3.823e-08', 'ppl': '1.135', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.47', 'tokens/total': 14944000, 'tokens/trainable': 290110, 'epoch': '1.946'}
 98%|█████████▊| 467/478 [2:04:02<02:45, 15.04s/it] 98%|█████████▊| 468/478 [2:04:17<02:30, 15.01s/it]                                                   {'loss': '0.1294', 'grad_norm': '5.344', 'learning_rate': '3.213e-08', 'ppl': '1.138', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.03', 'tokens/total': 14976000, 'tokens/trainable': 290738, 'epoch': '1.95'}
 98%|█████████▊| 468/478 [2:04:17<02:30, 15.01s/it] 98%|█████████▊| 469/478 [2:04:32<02:14, 14.99s/it]                                                   {'loss': '0.1489', 'grad_norm': '5.562', 'learning_rate': '2.655e-08', 'ppl': '1.161', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '17.81', 'tokens/total': 15008000, 'tokens/trainable': 291270, 'epoch': '1.954'}
 98%|█████████▊| 469/478 [2:04:32<02:14, 14.99s/it] 98%|█████████▊| 470/478 [2:04:47<01:59, 14.98s/it]                                                   {'loss': '0.1387', 'grad_norm': '5.312', 'learning_rate': '2.151e-08', 'ppl': '1.149', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20.5', 'tokens/total': 15040000, 'tokens/trainable': 291882, 'epoch': '1.958'}
 98%|█████████▊| 470/478 [2:04:47<01:59, 14.98s/it] 99%|█████████▊| 471/478 [2:05:02<01:44, 14.97s/it]                                                   {'loss': '0.1108', 'grad_norm': '4.594', 'learning_rate': '1.7e-08', 'ppl': '1.117', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.23', 'tokens/total': 15072000, 'tokens/trainable': 292516, 'epoch': '1.962'}
 99%|█████████▊| 471/478 [2:05:02<01:44, 14.97s/it] 99%|█████████▊| 472/478 [2:05:17<01:29, 14.96s/it]                                                   {'loss': '0.1416', 'grad_norm': '5.562', 'learning_rate': '1.301e-08', 'ppl': '1.152', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '17.82', 'tokens/total': 15104000, 'tokens/trainable': 293048, 'epoch': '1.967'}
 99%|█████████▊| 472/478 [2:05:17<01:29, 14.96s/it] 99%|█████████▉| 473/478 [2:05:32<01:14, 14.96s/it]                                                   {'loss': '0.1084', 'grad_norm': '4.75', 'learning_rate': '9.562e-09', 'ppl': '1.114', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '19.33', 'tokens/total': 15136000, 'tokens/trainable': 293625, 'epoch': '1.971'}
 99%|█████████▉| 473/478 [2:05:32<01:14, 14.96s/it] 99%|█████████▉| 474/478 [2:05:47<00:59, 14.95s/it]                                                   {'loss': '0.1045', 'grad_norm': '4.344', 'learning_rate': '6.641e-09', 'ppl': '1.11', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '23.11', 'tokens/total': 15168000, 'tokens/trainable': 294314, 'epoch': '1.975'}
 99%|█████████▉| 474/478 [2:05:47<00:59, 14.95s/it] 99%|█████████▉| 475/478 [2:06:02<00:44, 14.95s/it]                                                   {'loss': '0.1055', 'grad_norm': '4.812', 'learning_rate': '4.25e-09', 'ppl': '1.111', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20.31', 'tokens/total': 15200000, 'tokens/trainable': 294920, 'epoch': '1.979'}
 99%|█████████▉| 475/478 [2:06:02<00:44, 14.95s/it]100%|█████████▉| 476/478 [2:06:17<00:29, 14.95s/it]                                                   {'loss': '0.1462', 'grad_norm': '5.719', 'learning_rate': '2.391e-09', 'ppl': '1.157', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.54', 'tokens/total': 15232000, 'tokens/trainable': 295593, 'epoch': '1.983'}
100%|█████████▉| 476/478 [2:06:17<00:29, 14.95s/it]100%|█████████▉| 477/478 [2:06:32<00:14, 14.95s/it]                                                   {'loss': '0.1807', 'grad_norm': '6.406', 'learning_rate': '1.063e-09', 'ppl': '1.198', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.9', 'tokens/total': 15264000, 'tokens/trainable': 296247, 'epoch': '1.988'}
100%|█████████▉| 477/478 [2:06:32<00:14, 14.95s/it]100%|██████████| 478/478 [2:06:47<00:00, 14.95s/it]                                                   {'loss': '0.1331', 'grad_norm': '5.906', 'learning_rate': '2.657e-10', 'ppl': '1.142', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '19.63', 'tokens/total': 15296000, 'tokens/trainable': 296833, 'epoch': '1.992'}
100%|██████████| 478/478 [2:06:47<00:00, 14.95s/it][2026-04-17 04:16:11,186] [INFO] [axolotl.core.trainers.base] Running evaluation step...
[2026-04-17 04:16:19,065] [INFO] [axolotl.utils.samplers.multipack] gather_len_batches: [54, 54]

  0%|          | 0/27 [00:00<?, ?it/s]
  7%|▋         | 2/27 [00:02<00:35,  1.41s/it]
 11%|█         | 3/27 [00:05<00:47,  1.96s/it]
 15%|█▍        | 4/27 [00:08<00:51,  2.26s/it]
 19%|█▊        | 5/27 [00:11<00:53,  2.43s/it]
 22%|██▏       | 6/27 [00:13<00:53,  2.54s/it]
 26%|██▌       | 7/27 [00:16<00:52,  2.61s/it]
 30%|██▉       | 8/27 [00:19<00:50,  2.65s/it]
 33%|███▎      | 9/27 [00:22<00:48,  2.68s/it]
 37%|███▋      | 10/27 [00:24<00:45,  2.70s/it]
 41%|████      | 11/27 [00:27<00:43,  2.72s/it]
 44%|████▍     | 12/27 [00:30<00:39,  2.64s/it]
 48%|████▊     | 13/27 [00:33<00:38,  2.76s/it]
 52%|█████▏    | 14/27 [00:35<00:35,  2.76s/it]
 56%|█████▌    | 15/27 [00:38<00:33,  2.76s/it]
 59%|█████▉    | 16/27 [00:41<00:30,  2.76s/it]
 63%|██████▎   | 17/27 [00:44<00:27,  2.75s/it]
 67%|██████▋   | 18/27 [00:46<00:24,  2.75s/it]
 70%|███████   | 19/27 [00:49<00:22,  2.75s/it]
 74%|███████▍  | 20/27 [00:52<00:19,  2.75s/it]
 78%|███████▊  | 21/27 [00:54<00:16,  2.67s/it]
 81%|████████▏ | 22/27 [00:57<00:13,  2.78s/it]
 85%|████████▌ | 23/27 [01:00<00:11,  2.77s/it]
 89%|████████▉ | 24/27 [01:03<00:08,  2.77s/it]
 93%|█████████▎| 25/27 [01:06<00:05,  2.76s/it]
 96%|█████████▋| 26/27 [01:08<00:02,  2.76s/it]
100%|██████████| 27/27 [01:11<00:00,  2.79s/it]                                                   
                                               {'eval_loss': '0.2227', 'eval_runtime': '75.1', 'eval_samples_per_second': '2.783', 'eval_steps_per_second': '1.398', 'eval_ppl': '1.249', 'memory/max_active (GiB)': '34.91', 'memory/max_allocated (GiB)': '34.91', 'memory/device_reserved (GiB)': '57.25', 'epoch': '1.992', 'tokens/train_per_sec_per_gpu': '0'}
100%|██████████| 478/478 [2:08:10<00:00, 14.95s/it]
100%|██████████| 27/27 [01:13<00:00,  2.79s/it]
                                               [2026-04-17 04:17:40,668] [INFO] [axolotl.core.trainers.base] Saving model checkpoint to /workspace/data/outputs/qwen3-4B/fft_magnifi-module-classifier-04-17-relabelled-upsampled/checkpoint-478

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]
Writing model shards: 100%|██████████| 1/1 [00:20<00:00, 20.91s/it]Writing model shards: 100%|██████████| 1/1 [00:20<00:00, 20.91s/it]
                                                   {'train_runtime': '7816', 'train_samples_per_second': '0.122', 'train_steps_per_second': '0.061', 'train_loss': '0.1648', 'memory/max_active (GiB)': '12.01', 'memory/max_allocated (GiB)': '12.01', 'memory/device_reserved (GiB)': '12.11', 'epoch': '1.992', 'tokens/train_per_sec_per_gpu': '0'}
100%|██████████| 478/478 [2:10:13<00:00, 14.95s/it]100%|██████████| 478/478 [2:10:13<00:00, 16.35s/it]
[2026-04-17 04:20:34,131] [INFO] [axolotl.train] Training completed! Saving trained model to /workspace/data/outputs/qwen3-4B/fft_magnifi-module-classifier-04-17-relabelled-upsampled/.
[2026-04-17 04:20:39,898] [INFO] [axolotl.core.trainers.base] Saving model checkpoint to /workspace/data/outputs/qwen3-4B/fft_magnifi-module-classifier-04-17-relabelled-upsampled/
Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]Writing model shards: 100%|██████████| 1/1 [00:22<00:00, 22.07s/it]Writing model shards: 100%|██████████| 1/1 [00:22<00:00, 22.07s/it]
[2026-04-17 04:21:08,433] [INFO] [axolotl.core.trainers.base] Saving model checkpoint to /workspace/data/outputs/qwen3-4B/fft_magnifi-module-classifier-04-17-relabelled-upsampled/
Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]Writing model shards: 100%|██████████| 1/1 [00:20<00:00, 20.34s/it]Writing model shards: 100%|██████████| 1/1 [00:20<00:00, 20.35s/it]
Processing Files (0 / 0)      : |          |  0.00B /  0.00B            
New Data Upload               : |          |  0.00B /  0.00B            

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:   2%|▏         |  184MB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:   2%|▏         |  184MB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :   2%|▏         |  195MB / 8.83GB,   ???B/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:   4%|▍         |  336MB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :   4%|▍         |  347MB / 8.83GB,  758MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:   5%|▌         |  464MB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :   5%|▌         |  475MB / 8.83GB,  701MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:   7%|▋         |  576MB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :   7%|▋         |  587MB / 8.83GB,  653MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:   8%|▊         |  680MB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :   8%|▊         |  691MB / 8.83GB,  620MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  10%|█         |  912MB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  11%|█         |  976MB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  13%|█▎        | 1.12GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  13%|█▎        | 1.13GB / 8.83GB,  668MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  14%|█▍        | 1.22GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  14%|█▍        | 1.23GB / 8.83GB,  645MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  15%|█▍        | 1.30GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  15%|█▍        | 1.32GB / 8.83GB,  622MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  16%|█▌        | 1.40GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  16%|█▌        | 1.41GB / 8.83GB,  608MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  17%|█▋        | 1.51GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  17%|█▋        | 1.52GB / 8.83GB,  604MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  18%|█▊        | 1.61GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  18%|█▊        | 1.62GB / 8.83GB,  593MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  19%|█▉        | 1.70GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  19%|█▉        | 1.72GB / 8.83GB,  585MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  21%|██        | 1.82GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  21%|██        | 1.83GB / 8.83GB,  583MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  22%|██▏       | 1.92GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  22%|██▏       | 1.93GB / 8.83GB,  579MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  23%|██▎       | 2.02GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  23%|██▎       | 2.04GB / 8.83GB,  575MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  24%|██▍       | 2.11GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  24%|██▍       | 2.12GB / 8.83GB,  567MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  25%|██▌       | 2.21GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  25%|██▌       | 2.22GB / 8.83GB,  562MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  26%|██▌       | 2.30GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  26%|██▌       | 2.32GB / 8.83GB,  558MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  27%|██▋       | 2.41GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  27%|██▋       | 2.42GB / 8.83GB,  556MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  28%|██▊       | 2.50GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  28%|██▊       | 2.52GB / 8.83GB,  552MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  30%|██▉       | 2.62GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  30%|██▉       | 2.63GB / 8.83GB,  553MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  31%|███       | 2.71GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  31%|███       | 2.72GB / 8.83GB,  550MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  32%|███▏      | 2.82GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  32%|███▏      | 2.83GB / 8.83GB,  548MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  33%|███▎      | 2.91GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  33%|███▎      | 2.92GB / 8.83GB,  546MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  34%|███▍      | 3.01GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  34%|███▍      | 3.02GB / 8.83GB,  543MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  35%|███▌      | 3.11GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  35%|███▌      | 3.12GB / 8.83GB,  542MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  37%|███▋      | 3.22GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  37%|███▋      | 3.24GB / 8.83GB,  543MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  38%|███▊      | 3.33GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  38%|███▊      | 3.34GB / 8.83GB,  542MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  39%|███▉      | 3.43GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  39%|███▉      | 3.44GB / 8.83GB,  541MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  40%|████      | 3.54GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  40%|████      | 3.56GB / 8.83GB,  542MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  41%|████▏     | 3.65GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  41%|████▏     | 3.66GB / 8.83GB,  541MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  43%|████▎     | 3.76GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  43%|████▎     | 3.77GB / 8.83GB,  542MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  44%|████▍     | 3.86GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  44%|████▍     | 3.88GB / 8.83GB,  541MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  45%|████▌     | 3.98GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  45%|████▌     | 4.00GB / 8.83GB,  543MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  47%|████▋     | 4.10GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  47%|████▋     | 4.12GB / 8.83GB,  544MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  48%|████▊     | 4.22GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  48%|████▊     | 4.23GB / 8.83GB,  545MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  49%|████▉     | 4.34GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  49%|████▉     | 4.36GB / 8.83GB,  547MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  51%|█████     | 4.46GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  51%|█████     | 4.48GB / 8.83GB,  549MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  52%|█████▏    | 4.58GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  52%|█████▏    | 4.59GB / 8.83GB,  549MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  53%|█████▎    | 4.70GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  53%|█████▎    | 4.71GB / 8.83GB,  550MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  54%|█████▍    | 4.81GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  55%|█████▍    | 4.82GB / 8.83GB,  550MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  56%|█████▌    | 4.93GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  56%|█████▌    | 4.94GB / 8.83GB,  552MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  57%|█████▋    | 5.05GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  57%|█████▋    | 5.06GB / 8.83GB,  553MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  58%|█████▊    | 5.16GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  59%|█████▊    | 5.17GB / 8.83GB,  553MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  60%|█████▉    | 5.28GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  60%|█████▉    | 5.29GB / 8.83GB,  554MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  61%|██████    | 5.39GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  61%|██████    | 5.40GB / 8.83GB,  554MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  62%|██████▏   | 5.51GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  63%|██████▎   | 5.52GB / 8.83GB,  555MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  64%|██████▎   | 5.62GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  64%|██████▍   | 5.64GB / 8.83GB,  555MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  65%|██████▌   | 5.74GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  65%|██████▌   | 5.76GB / 8.83GB,  556MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  66%|██████▋   | 5.86GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  67%|██████▋   | 5.88GB / 8.83GB,  557MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  68%|██████▊   | 5.98GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  68%|██████▊   | 5.99GB / 8.83GB,  553MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  69%|██████▉   | 6.10GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  69%|██████▉   | 6.11GB / 8.83GB,  552MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  70%|███████   | 6.22GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  70%|███████   | 6.23GB / 8.83GB,  553MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  72%|███████▏  | 6.34GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  72%|███████▏  | 6.35GB / 8.83GB,  554MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  73%|███████▎  | 6.46GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  73%|███████▎  | 6.47GB / 8.83GB,  550MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  75%|███████▍  | 6.58GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  75%|███████▍  | 6.59GB / 8.83GB,  549MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  76%|███████▌  | 6.70GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  76%|███████▌  | 6.71GB / 8.83GB,  547MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  77%|███████▋  | 6.81GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  77%|███████▋  | 6.82GB / 8.83GB,  548MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  79%|███████▊  | 6.93GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  79%|███████▊  | 6.94GB / 8.83GB,  551MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  80%|███████▉  | 7.05GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  80%|███████▉  | 7.06GB / 8.83GB,  554MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  81%|████████▏ | 7.18GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  81%|████████▏ | 7.19GB / 8.83GB,  555MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  83%|████████▎ | 7.29GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  83%|████████▎ | 7.30GB / 8.83GB,  557MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  84%|████████▍ | 7.42GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  84%|████████▍ | 7.43GB / 8.83GB,  560MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  85%|████████▌ | 7.53GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  85%|████████▌ | 7.54GB / 8.83GB,  560MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  87%|████████▋ | 7.65GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  87%|████████▋ | 7.66GB / 8.83GB,  562MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  88%|████████▊ | 7.77GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  88%|████████▊ | 7.78GB / 8.83GB,  563MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  89%|████████▉ | 7.89GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  89%|████████▉ | 7.90GB / 8.83GB,  566MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  91%|█████████ | 8.01GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  91%|█████████ | 8.02GB / 8.83GB,  569MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  92%|█████████▏| 8.13GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  92%|█████████▏| 8.14GB / 8.83GB,  571MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  93%|█████████▎| 8.24GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  93%|█████████▎| 8.25GB / 8.83GB,  572MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  95%|█████████▌| 8.38GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  95%|█████████▌| 8.40GB / 8.83GB,  576MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  97%|█████████▋| 8.52GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  97%|█████████▋| 8.53GB / 8.83GB,  579MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  98%|█████████▊| 8.63GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  98%|█████████▊| 8.64GB / 8.83GB,  580MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  99%|█████████▉| 8.75GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  99%|█████████▉| 8.76GB / 8.83GB,  582MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors: 100%|██████████| 8.82GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (3 / 3)      : 100%|██████████| 8.83GB / 8.83GB,  580MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors: 100%|██████████| 8.82GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors: 100%|██████████| 8.82GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors: 100%|██████████| 8.82GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors: 100%|██████████| 8.82GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors: 100%|██████████| 8.82GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors: 100%|██████████| 8.82GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors: 100%|██████████| 8.82GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors: 100%|██████████| 8.82GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (3 / 3)      : 100%|██████████| 8.83GB / 8.83GB,  506MB/s  
New Data Upload               : |          |  0.00B /  0.00B,  0.00B/s  
  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
  ...sampled/model.safetensors: 100%|██████████| 8.82GB / 8.82GB            
  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            
[2026-04-17 04:21:54,742] [INFO] [axolotl.train] Model successfully saved to /workspace/data/outputs/qwen3-4B/fft_magnifi-module-classifier-04-17-relabelled-upsampled/
[2026-04-17 04:22:00,408] [INFO] [axolotl.core.trainers.base] Saving model checkpoint to /workspace/data/outputs/qwen3-4B/fft_magnifi-module-classifier-04-17-relabelled-upsampled/
Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]Writing model shards: 100%|██████████| 1/1 [00:22<00:00, 22.09s/it]Writing model shards: 100%|██████████| 1/1 [00:22<00:00, 22.09s/it]
Processing Files (0 / 0)      : |          |  0.00B /  0.00B            
New Data Upload               : |          |  0.00B /  0.00B            

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:   2%|▏         |  160MB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:   2%|▏         |  160MB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :   2%|▏         |  171MB / 8.83GB,   ???B/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:   3%|▎         |  248MB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :   3%|▎         |  259MB / 8.83GB,  440MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:   4%|▍         |  352MB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :   4%|▍         |  363MB / 8.83GB,  480MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:   5%|▌         |  464MB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :   5%|▌         |  475MB / 8.83GB,  507MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:   7%|▋         |  576MB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :   7%|▋         |  587MB / 8.83GB,  520MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:   8%|▊         |  680MB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :   8%|▊         |  691MB / 8.83GB,  520MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:   9%|▉         |  784MB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :   9%|▉         |  795MB / 8.83GB,  520MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  10%|█         |  888MB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  10%|█         |  899MB / 8.83GB,  520MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  11%|█         |  984MB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  11%|█▏        |  995MB / 8.83GB,  515MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  12%|█▏        | 1.09GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  12%|█▏        | 1.10GB / 8.83GB,  515MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  13%|█▎        | 1.18GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  14%|█▎        | 1.20GB / 8.83GB,  512MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  15%|█▍        | 1.29GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  15%|█▍        | 1.30GB / 8.83GB,  513MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  16%|█▌        | 1.39GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  16%|█▌        | 1.40GB / 8.83GB,  513MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  17%|█▋        | 1.50GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  17%|█▋        | 1.51GB / 8.83GB,  514MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  18%|█▊        | 1.61GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  18%|█▊        | 1.62GB / 8.83GB,  517MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  19%|█▉        | 1.72GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  20%|█▉        | 1.73GB / 8.83GB,  520MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  21%|██        | 1.82GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  21%|██        | 1.83GB / 8.83GB,  517MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  22%|██▏       | 1.91GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  22%|██▏       | 1.92GB / 8.83GB,  515MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  23%|██▎       | 2.01GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  23%|██▎       | 2.02GB / 8.83GB,  513MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  24%|██▍       | 2.11GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  24%|██▍       | 2.12GB / 8.83GB,  514MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  25%|██▌       | 2.21GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  25%|██▌       | 2.22GB / 8.83GB,  512MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  26%|██▌       | 2.30GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  26%|██▌       | 2.32GB / 8.83GB,  511MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  27%|██▋       | 2.40GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  27%|██▋       | 2.41GB / 8.83GB,  509MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  28%|██▊       | 2.50GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  28%|██▊       | 2.52GB / 8.83GB,  510MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  29%|██▉       | 2.60GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  30%|██▉       | 2.61GB / 8.83GB,  508MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  31%|███       | 2.72GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  31%|███       | 2.73GB / 8.83GB,  512MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  32%|███▏      | 2.84GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  32%|███▏      | 2.85GB / 8.83GB,  515MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  33%|███▎      | 2.94GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  33%|███▎      | 2.96GB / 8.83GB,  516MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  35%|███▍      | 3.06GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  35%|███▍      | 3.07GB / 8.83GB,  517MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  36%|███▌      | 3.16GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  36%|███▌      | 3.17GB / 8.83GB,  517MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  37%|███▋      | 3.27GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  37%|███▋      | 3.28GB / 8.83GB,  519MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  38%|███▊      | 3.38GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  38%|███▊      | 3.39GB / 8.83GB,  519MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  40%|███▉      | 3.49GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  40%|███▉      | 3.50GB / 8.83GB,  520MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  41%|████      | 3.60GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  41%|████      | 3.61GB / 8.83GB,  521MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  42%|████▏     | 3.73GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  42%|████▏     | 3.74GB / 8.83GB,  525MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  44%|████▎     | 3.84GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  44%|████▎     | 3.85GB / 8.83GB,  526MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  45%|████▍     | 3.96GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  45%|████▍     | 3.97GB / 8.83GB,  528MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  46%|████▌     | 4.07GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  46%|████▌     | 4.08GB / 8.83GB,  529MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  48%|████▊     | 4.19GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  48%|████▊     | 4.20GB / 8.83GB,  531MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  49%|████▉     | 4.34GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  49%|████▉     | 4.36GB / 8.83GB,  536MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  51%|█████     | 4.46GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  51%|█████     | 4.47GB / 8.83GB,  537MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  52%|█████▏    | 4.58GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  52%|█████▏    | 4.59GB / 8.83GB,  539MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  53%|█████▎    | 4.69GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  53%|█████▎    | 4.70GB / 8.83GB,  539MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  55%|█████▍    | 4.82GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  55%|█████▍    | 4.83GB / 8.83GB,  541MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  56%|█████▌    | 4.94GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  56%|█████▌    | 4.95GB / 8.83GB,  543MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  57%|█████▋    | 5.06GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  57%|█████▋    | 5.07GB / 8.83GB,  544MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  59%|█████▊    | 5.17GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  59%|█████▊    | 5.18GB / 8.83GB,  544MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  60%|█████▉    | 5.29GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  60%|█████▉    | 5.30GB / 8.83GB,  546MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  61%|██████    | 5.40GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  61%|██████▏   | 5.41GB / 8.83GB,  546MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  63%|██████▎   | 5.54GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  63%|██████▎   | 5.55GB / 8.83GB,  549MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  64%|██████▍   | 5.66GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  64%|██████▍   | 5.68GB / 8.83GB,  550MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  66%|██████▌   | 5.78GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  66%|██████▌   | 5.80GB / 8.83GB,  551MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  67%|██████▋   | 5.90GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  67%|██████▋   | 5.92GB / 8.83GB,  555MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  68%|██████▊   | 6.02GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  68%|██████▊   | 6.03GB / 8.83GB,  555MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  70%|██████▉   | 6.14GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  70%|██████▉   | 6.15GB / 8.83GB,  556MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  71%|███████   | 6.26GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  71%|███████   | 6.27GB / 8.83GB,  557MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  72%|███████▏  | 6.38GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  72%|███████▏  | 6.39GB / 8.83GB,  558MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  74%|███████▎  | 6.49GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  74%|███████▎  | 6.50GB / 8.83GB,  559MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  75%|███████▍  | 6.61GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  75%|███████▍  | 6.62GB / 8.83GB,  561MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  76%|███████▌  | 6.72GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  76%|███████▌  | 6.73GB / 8.83GB,  562MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  78%|███████▊  | 6.84GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  78%|███████▊  | 6.85GB / 8.83GB,  564MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  79%|███████▉  | 6.95GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  79%|███████▉  | 6.96GB / 8.83GB,  566MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  80%|████████  | 7.06GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  80%|████████  | 7.08GB / 8.83GB,  566MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  82%|████████▏ | 7.19GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  82%|████████▏ | 7.20GB / 8.83GB,  569MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  90%|█████████ | 7.98GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  91%|█████████ | 7.99GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  91%|█████████ | 7.99GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  91%|█████████ | 8.00GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  92%|█████████▏| 8.12GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  92%|█████████▏| 8.13GB / 8.83GB,  609MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  94%|█████████▎| 8.26GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  94%|█████████▎| 8.27GB / 8.83GB,  613MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  95%|█████████▌| 8.41GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  95%|█████████▌| 8.42GB / 8.83GB,  617MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  97%|█████████▋| 8.53GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  97%|█████████▋| 8.54GB / 8.83GB,  620MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  98%|█████████▊| 8.64GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  98%|█████████▊| 8.65GB / 8.83GB,  621MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors:  99%|█████████▉| 8.76GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (2 / 3)      :  99%|█████████▉| 8.77GB / 8.83GB,  624MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors: 100%|██████████| 8.82GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (3 / 3)      : 100%|██████████| 8.83GB / 8.83GB,  620MB/s  

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors: 100%|██████████| 8.82GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors: 100%|██████████| 8.82GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors: 100%|██████████| 8.82GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors: 100%|██████████| 8.82GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            

  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            


  ...sampled/model.safetensors: 100%|██████████| 8.82GB / 8.82GB            


  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB            Processing Files (3 / 3)      : 100%|██████████| 8.83GB / 8.83GB,  577MB/s  
New Data Upload               : |          |  0.00B /  0.00B,  0.00B/s  
  ...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB            
  ...sampled/model.safetensors: 100%|██████████| 8.82GB / 8.82GB            
  ...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB