Files

2397 lines
316 KiB
Plaintext
Raw Permalink Normal View History

The following values were not passed to `accelerate launch` and had defaults used instead:
`--num_processes` was set to a value of `2`
More than one GPU was found, enabling multi-GPU training.
If this was unintended please pass in `--num_processes=1`.
`--num_machines` was set to a value of `1`
`--mixed_precision` was set to a value of `'no'`
`--dynamo_backend` was set to a value of `'no'`
To avoid this warning pass in values for each of the problematic parameters or run `accelerate config`.
[2026-04-17 02:08:45,271] [WARNING] [torchao] Skipping import of cpp extensions due to incompatible torch version. Please upgrade to torch >= 2.11.0 (found 2.10.0+cu128).
[2026-04-17 02:08:45,439] [WARNING] [torchao] Skipping import of cpp extensions due to incompatible torch version. Please upgrade to torch >= 2.11.0 (found 2.10.0+cu128).
[2026-04-17 02:08:47,222] [WARNING] [axolotl.utils.schemas.validation] sample_packing without flash, sdp, xformers, sage, or flex attention does not handle cross sample decontamination.
[2026-04-17 02:08:47,223] [INFO] [axolotl.utils.schemas.validation] Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing
[2026-04-17 02:08:47,223] [WARNING] [axolotl.utils.schemas.validation] Configuring FSDP fields with the `fsdp_` prefix is deprecated. Please omit the `fsdp_` prefix from the any fields in `fsdp_config`.
[2026-04-17 02:08:47,467] [INFO] [axolotl.cli.config] config:
{
"activation_offloading": false,
"axolotl_config_path": "/workspace/data/sage-classifier-train-scripts/qwen3/fft/qwen3-4B-train-v1-6-no-liger-flex-magnifi-module-classifier-04-17-relabelled-upsampled.yml",
"base_model": "Tifin-Sage/magnifi-classifier-01-05-search-agent-3-epochs-3k-unknown-errors",
"base_model_config": "Tifin-Sage/magnifi-classifier-01-05-search-agent-3-epochs-3k-unknown-errors",
"batch_size": 2,
"bf16": true,
"capabilities": {
"bf16": true,
"compute_capability": "sm_80",
"fp8": false,
"n_gpu": 2,
"n_node": 1,
"tf32": true
},
"chat_template": "qwen3",
"context_parallel_size": 1,
"dataloader_num_workers": 2,
"dataloader_pin_memory": true,
"dataloader_prefetch_factor": 256,
"dataset_num_proc": 128,
"dataset_prepared_path": "/workspace/data/datasets_prepared/magnifi-module-classifier-04-17-relabelled-upsampled",
"datasets": [
{
"chat_template": "tokenizer_default",
"field_messages": "messages",
"message_property_mappings": {
"content": "content",
"role": "role"
},
"path": "Tifin-Sage/magnifi-module-classifier-04-17-relabelled-upsampled",
"split": "train",
"trust_remote_code": false,
"type": "chat_template"
}
],
"ddp": true,
"device": "cuda:0",
"device_map": {
"": 0
},
"dion_rank_fraction": 1.0,
"dion_rank_multiple_of": 1,
"eaft_alpha": 1.0,
"eaft_k": 20,
"env_capabilities": {
"torch_version": "2.10.0"
},
"eval_batch_size": 1,
"eval_causal_lm_metrics": [
"sacrebleu",
"comet",
"ter",
"chrf"
],
"eval_max_new_tokens": 128,
"eval_sample_packing": true,
"eval_steps": 0.25,
"eval_table_size": 0,
"evals_per_epoch": 2,
"experimental_skip_move_to_device": true,
"fp16": false,
"fsdp": [
"full_shard",
"auto_wrap"
],
"fsdp_config": {
"activation_checkpointing": true,
"auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
"cpu_ram_efficient_loading": true,
"fsdp_version": 2,
"offload_params": false,
"reshard_after_forward": true,
"state_dict_type": "FULL_STATE_DICT",
"transformer_layer_cls_to_wrap": "Qwen3DecoderLayer"
},
"fsdp_version": 2,
"generate_samples": false,
"generation_do_sample": true,
"generation_max_new_tokens": 50,
"generation_prompt_ratio": 0.5,
"generation_temperature": 0.7,
"gradient_accumulation_steps": 1,
"gradient_checkpointing": false,
"hub_model_id": "Tifin-Sage/magnifi-module-classifier-04-17-relabelled-upsampled",
"include_tkps": true,
"layer_offloading": false,
"learning_rate": 2e-05,
"lisa_layers_attribute": "model.layers",
"load_best_model_at_end": false,
"load_in_4bit": false,
"load_in_8bit": false,
"local_rank": 0,
"logging_steps": 1,
"lora_dropout": 0.0,
"loraplus_lr_embedding": 1e-06,
"lr_scheduler": "cosine",
"mean_resizing_embeddings": false,
"merge_method": "memory_efficient",
"micro_batch_size": 1,
"model_config_type": "qwen3",
"num_epochs": 2.0,
"num_generation_samples": 3,
"optimizer": "adamw_torch_fused",
"otel_metrics_host": "localhost",
"otel_metrics_port": 8000,
"output_dir": "/workspace/data/outputs/qwen3-4B/fft_magnifi-module-classifier-04-17-relabelled-upsampled/",
"pad_to_sequence_len": true,
"pretrain_multipack_attn": true,
"profiler_steps_start": 0,
"qlora_sharded_model_loading": false,
"quantize_moe_experts": false,
"ray_num_workers": 1,
"resources_per_worker": {
"GPU": 1
},
"sample_packing": true,
"sample_packing_bin_size": 200,
"sample_packing_group_size": 100000,
"save_only_model": false,
"save_safetensors": true,
"save_steps": 0.5,
"saves_per_epoch": 1,
"sequence_len": 16000,
"shuffle_before_merging_datasets": false,
"shuffle_merged_datasets": true,
"skip_prepare_dataset": false,
"streaming_multipack_buffer_size": 10000,
"strict": false,
"tensor_parallel_size": 1,
"tf32": true,
"tiled_mlp_use_original_mlp": true,
"tokenizer_config": "Tifin-Sage/magnifi-classifier-01-05-search-agent-3-epochs-3k-unknown-errors",
"tokenizer_save_jinja_files": true,
"torch_dtype": "torch.bfloat16",
"train_on_inputs": false,
"trl": {
"async_prefetch": false,
"log_completions": false,
"mask_truncated_completions": false,
"ref_model_mixup_alpha": 0.9,
"ref_model_sync_steps": 64,
"replay_buffer_size": 0,
"replay_recompute_logps": true,
"reroll_max_groups": 1,
"reroll_start_fraction": 1.0,
"reward_num_workers": 1,
"scale_rewards": true,
"skip_zero_advantage_batches": true,
"sync_ref_model": false,
"use_data_producer": false,
"use_vllm": false,
"vllm_lora_sync": false,
"vllm_server_host": "0.0.0.0",
"vllm_server_port": 8000
},
"use_otel_metrics": false,
"use_ray": false,
"use_wandb": true,
"val_set_size": 0.1,
"vllm": {
"device": "auto",
"dtype": "auto",
"gpu_memory_utilization": 0.9,
"host": "0.0.0.0",
"port": 8000
},
"wandb_name": "magnifi-module-classifier-04-17-relabelled-upsampled",
"wandb_project": "sage-classifier",
"warmup_ratio": 0.1,
"weight_decay": 0.0,
"world_size": 2
}
[2026-04-17 02:08:51,607] [INFO] [axolotl.utils.data.shared] Loading prepared dataset from disk at /workspace/data/datasets_prepared/magnifi-module-classifier-04-17-relabelled-upsampled/6241b9d0f4bdccc4ed4f52e5adefd1bc...
[Gloo] Rank [Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1
0 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1
[2026-04-17 02:08:57,019] [INFO] [axolotl.utils.samplers.multipack] gather_len_batches: [54, 54]
[2026-04-17 02:08:57,129] [INFO] [axolotl.utils.trainer] sample_packing_eff_est across ranks: [0.9649779796600342, 0.9649779796600342]
[2026-04-17 02:09:01,870] [INFO] [axolotl.utils.samplers.multipack] gather_len_batches: [478, 478]
[2026-04-17 02:09:01,872] [INFO] [axolotl.utils.trainer] sample_packing_eff_est across ranks: [0.969444751739502, 0.9775572419166565]
[2026-04-17 02:09:01,874] [INFO] [axolotl.utils.data.sft] Maximum number of steps set at 478
[2026-04-17 02:09:03,028] [INFO] [axolotl.loaders.patch_manager] Applying multipack dataloader patch for sample packing...
Fetching 2 files: 0%| | 0/2 [00:00<?, ?it/s] Fetching 2 files: 0%| | 0/2 [00:00<?, ?it/s] Fetching 2 files: 50%|█████ | 1/2 [00:05<00:05, 5.40s/it] Fetching 2 files: 100%|██████████| 2/2 [00:05<00:00, 2.70s/it]
Fetching 2 files: 50%|█████ | 1/2 [00:05<00:05, 5.42s/it] Fetching 2 files: 100%|██████████| 2/2 [00:05<00:00, 2.71s/it]
Loading weights: 0%| | 0/399 [00:00<?, ?it/s] Loading weights: 76%|███████▌ | 303/399 [00:00<00:00, 3028.93it/s] Loading weights: 100%|██████████| 399/399 [00:00<00:00, 2952.58it/s]
Loading weights: 0%| | 0/399 [00:00<?, ?it/s] Loading weights: 83%|████████▎ | 331/399 [00:00<00:00, 3306.31it/s] Loading weights: 100%|██████████| 399/399 [00:00<00:00, 3610.06it/s]
[2026-04-17 02:09:10,437] [INFO] [axolotl.loaders.model] Converting modules to torch.bfloat16
[2026-04-17 02:09:12,900] [WARNING] [accelerate.utils.dataclasses] sync_module_states is obsolete in FSDP2, as it is not needed anymore.Setting sync_module_states to None.
[2026-04-17 02:09:13,195] [WARNING] [accelerate.utils.dataclasses] sync_module_states is obsolete in FSDP2, as it is not needed anymore.Setting sync_module_states to None.
[2026-04-17 02:09:13,778] [INFO] [axolotl.train] Pre-saving tokenizer to /workspace/data/outputs/qwen3-4B/fft_magnifi-module-classifier-04-17-relabelled-upsampled/...
[2026-04-17 02:09:13,930] [INFO] [axolotl.train] Pre-saving model config to /workspace/data/outputs/qwen3-4B/fft_magnifi-module-classifier-04-17-relabelled-upsampled/...
[2026-04-17 02:09:13,980] [INFO] [axolotl.train] Starting trainer...
[2026-04-17 02:09:19,915] [INFO] [axolotl.utils.samplers.multipack] gather_len_batches: [480, 480]
[2026-04-17 02:09:20,072] [INFO] [axolotl.monkeypatch.accelerate.fsdp2] Broadcasting full state dict to all ranks...
wandb: [wandb.login()] Loaded credentials for https://api.wandb.ai from WANDB_API_KEY.
wandb: Currently logged in as: subhanandh-t (subhanandh-t-tifin) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
wandb: setting up run 4dxwgvhh
wandb: Tracking run with wandb version 0.26.0
wandb: Run data is saved locally in /workspace/wandb/run-20260417_020921-4dxwgvhh
wandb: Run `wandb offline` to turn off syncing.
wandb: Syncing run magnifi-module-classifier-04-17-relabelled-upsampled
wandb: ⭐️ View project at https://wandb.ai/subhanandh-t-tifin/sage-classifier
wandb: 🚀 View run at https://wandb.ai/subhanandh-t-tifin/sage-classifier/runs/4dxwgvhh
wandb: WARNING Saving files without folders. If you want to preserve subdirectories pass base_path to wandb.save, i.e. wandb.save("/mnt/folder/file.h5", base_path="/mnt")
wandb: WARNING Symlinked 1 file into the W&B run directory; call wandb.save again to sync new files.
[2026-04-17 02:09:24,132] [INFO] [axolotl.utils.callbacks] The Axolotl config has been saved to the WandB run under files.
0%| | 0/478 [00:00<?, ?it/s][2026-04-17 02:09:24,137] [INFO] [axolotl.core.trainers.base] Running evaluation step...
[2026-04-17 02:09:29,462] [INFO] [axolotl.utils.samplers.multipack] gather_len_batches: [54, 54]
0%| | 0/27 [00:00<?, ?it/s]
7%|▋ | 2/27 [00:01<00:18, 1.37it/s]
11%|█ | 3/27 [00:04<00:37, 1.57s/it]
15%|█▍ | 4/27 [00:06<00:46, 2.01s/it]
19%|█▊ | 5/27 [00:09<00:49, 2.27s/it]
22%|██▏ | 6/27 [00:12<00:51, 2.43s/it]
26%|██▌ | 7/27 [00:15<00:50, 2.54s/it]
30%|██▉ | 8/27 [00:17<00:49, 2.60s/it]
33%|███▎ | 9/27 [00:20<00:47, 2.65s/it]
37%|███▋ | 10/27 [00:23<00:45, 2.68s/it]
41%|████ | 11/27 [00:26<00:43, 2.70s/it]
44%|████▍ | 12/27 [00:28<00:39, 2.63s/it]
48%|████▊ | 13/27 [00:31<00:38, 2.76s/it]
52%|█████▏ | 14/27 [00:34<00:35, 2.77s/it]
56%|█████▌ | 15/27 [00:37<00:33, 2.76s/it]
59%|█████▉ | 16/27 [00:40<00:30, 2.76s/it]
63%|██████▎ | 17/27 [00:42<00:27, 2.76s/it]
67%|██████▋ | 18/27 [00:45<00:24, 2.76s/it]
70%|███████ | 19/27 [00:48<00:22, 2.76s/it]
74%|███████▍ | 20/27 [00:51<00:19, 2.77s/it]
78%|███████▊ | 21/27 [00:53<00:16, 2.68s/it]
81%|████████▏ | 22/27 [00:56<00:13, 2.79s/it]
85%|████████▌ | 23/27 [00:59<00:11, 2.78s/it]
89%|████████▉ | 24/27 [01:02<00:08, 2.77s/it]
93%|█████████▎| 25/27 [01:04<00:05, 2.77s/it]
96%|█████████▋| 26/27 [01:07<00:02, 2.76s/it]
100%|██████████| 27/27 [01:10<00:00, 2.79s/it]
{'eval_loss': '0.2049', 'eval_runtime': '75.78', 'eval_samples_per_second': '2.758', 'eval_steps_per_second': '1.386', 'eval_ppl': '1.227', 'memory/max_active (GiB)': '27.41', 'memory/max_allocated (GiB)': '27.41', 'memory/device_reserved (GiB)': '30.62', 'epoch': 0}
0%| | 0/478 [01:21<?, ?it/s]
100%|██████████| 27/27 [01:11<00:00, 2.79s/it]
0%| | 1/478 [01:37<12:54:11, 97.38s/it] {'loss': '0.1978', 'grad_norm': '6.188', 'learning_rate': '0', 'ppl': '1.219', 'memory/max_active (GiB)': '37.9', 'memory/max_allocated (GiB)': '37.9', 'memory/device_reserved (GiB)': '49.99', 'tokens/train_per_sec_per_gpu': '22.08', 'tokens/total': 32000, 'tokens/trainable': 687, 'epoch': '0.004167'}
0%| | 1/478 [01:37<12:54:11, 97.38s/it] 0%| | 2/478 [01:52<6:27:59, 48.91s/it] {'loss': '0.2119', 'grad_norm': '6.781', 'learning_rate': '4.255e-07', 'ppl': '1.236', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.84', 'tokens/total': 64000, 'tokens/trainable': 1310, 'epoch': '0.008333'}
0%| | 2/478 [01:52<6:27:59, 48.91s/it] 1%| | 3/478 [02:07<4:24:28, 33.41s/it] {'loss': '0.1929', 'grad_norm': '7.031', 'learning_rate': '8.511e-07', 'ppl': '1.213', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '18.47', 'tokens/total': 96000, 'tokens/trainable': 1862, 'epoch': '0.0125'}
1%| | 3/478 [02:07<4:24:28, 33.41s/it] 1%| | 4/478 [02:22<3:26:25, 26.13s/it] {'loss': '0.1885', 'grad_norm': '8.938', 'learning_rate': '1.277e-06', 'ppl': '1.207', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '18.06', 'tokens/total': 128000, 'tokens/trainable': 2402, 'epoch': '0.01667'}
1%| | 4/478 [02:22<3:26:25, 26.13s/it] 1%| | 5/478 [02:37<2:54:16, 22.11s/it] {'loss': '0.1914', 'grad_norm': '6.75', 'learning_rate': '1.702e-06', 'ppl': '1.211', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '18.36', 'tokens/total': 160000, 'tokens/trainable': 2951, 'epoch': '0.02083'}
1%| | 5/478 [02:37<2:54:16, 22.11s/it] 1%|▏ | 6/478 [02:52<2:34:49, 19.68s/it] {'loss': '0.1206', 'grad_norm': '4.875', 'learning_rate': '2.128e-06', 'ppl': '1.128', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.83', 'tokens/total': 192000, 'tokens/trainable': 3574, 'epoch': '0.025'}
1%|▏ | 6/478 [02:52<2:34:49, 19.68s/it] 1%|▏ | 7/478 [03:07<2:22:25, 18.14s/it] {'loss': '0.2275', 'grad_norm': '9', 'learning_rate': '2.553e-06', 'ppl': '1.256', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '19.33', 'tokens/total': 224000, 'tokens/trainable': 4152, 'epoch': '0.02917'}
1%|▏ | 7/478 [03:07<2:22:25, 18.14s/it] 2%|▏ | 8/478 [03:22<2:14:12, 17.13s/it] {'loss': '0.2021', 'grad_norm': '7.312', 'learning_rate': '2.979e-06', 'ppl': '1.224', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '19.73', 'tokens/total': 256000, 'tokens/trainable': 4742, 'epoch': '0.03333'}
2%|▏ | 8/478 [03:22<2:14:12, 17.13s/it] 2%|▏ | 9/478 [03:37<2:08:38, 16.46s/it] {'loss': '0.1592', 'grad_norm': '4.969', 'learning_rate': '3.404e-06', 'ppl': '1.173', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.34', 'tokens/total': 288000, 'tokens/trainable': 5410, 'epoch': '0.0375'}
2%|▏ | 9/478 [03:37<2:08:38, 16.46s/it] 2%|▏ | 10/478 [03:52<2:04:46, 16.00s/it] {'loss': '0.1953', 'grad_norm': '5.938', 'learning_rate': '3.83e-06', 'ppl': '1.216', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.77', 'tokens/total': 320000, 'tokens/trainable': 6031, 'epoch': '0.04167'}
2%|▏ | 10/478 [03:52<2:04:46, 16.00s/it] 2%|▏ | 11/478 [04:07<2:02:05, 15.69s/it] {'loss': '0.1826', 'grad_norm': '5.5', 'learning_rate': '4.255e-06', 'ppl': '1.2', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '19.52', 'tokens/total': 352000, 'tokens/trainable': 6615, 'epoch': '0.04583'}
2%|▏ | 11/478 [04:07<2:02:05, 15.69s/it] 3%|▎ | 12/478 [04:22<2:00:07, 15.47s/it] {'loss': '0.2266', 'grad_norm': '6.594', 'learning_rate': '4.681e-06', 'ppl': '1.254', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.01', 'tokens/total': 384000, 'tokens/trainable': 7213, 'epoch': '0.05'}
3%|▎ | 12/478 [04:22<2:00:07, 15.47s/it] 3%|▎ | 13/478 [04:37<1:58:41, 15.32s/it] {'loss': '0.248', 'grad_norm': '7.031', 'learning_rate': '5.106e-06', 'ppl': '1.282', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '18.44', 'tokens/total': 416000, 'tokens/trainable': 7764, 'epoch': '0.05417'}
3%|▎ | 13/478 [04:37<1:58:41, 15.32s/it] 3%|▎ | 14/478 [04:52<1:57:38, 15.21s/it] {'loss': '0.1992', 'grad_norm': '6.344', 'learning_rate': '5.532e-06', 'ppl': '1.22', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '18.93', 'tokens/total': 448000, 'tokens/trainable': 8330, 'epoch': '0.05833'}
3%|▎ | 14/478 [04:52<1:57:38, 15.21s/it] 3%|▎ | 15/478 [05:06<1:56:49, 15.14s/it] {'loss': '0.1826', 'grad_norm': '7.469', 'learning_rate': '5.957e-06', 'ppl': '1.2', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.44', 'tokens/total': 480000, 'tokens/trainable': 9001, 'epoch': '0.0625'}
3%|▎ | 15/478 [05:07<1:56:49, 15.14s/it] 3%|▎ | 16/478 [05:21<1:56:10, 15.09s/it] {'loss': '0.1689', 'grad_norm': '5.688', 'learning_rate': '6.383e-06', 'ppl': '1.184', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.78', 'tokens/total': 512000, 'tokens/trainable': 9652, 'epoch': '0.06667'}
3%|▎ | 16/478 [05:21<1:56:10, 15.09s/it] 4%|▎ | 17/478 [05:36<1:55:34, 15.04s/it] {'loss': '0.1113', 'grad_norm': '6.594', 'learning_rate': '6.809e-06', 'ppl': '1.118', 'memory/max_active (GiB)': '45.17', 'memory/max_allocated (GiB)': '45.17', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '23.16', 'tokens/total': 544000, 'tokens/trainable': 10343, 'epoch': '0.07083'}
4%|▎ | 17/478 [05:36<1:55:34, 15.04s/it] 4%|▍ | 18/478 [05:51<1:55:09, 15.02s/it] {'loss': '0.1919', 'grad_norm': '6.281', 'learning_rate': '7.234e-06', 'ppl': '1.212', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.71', 'tokens/total': 576000, 'tokens/trainable': 11022, 'epoch': '0.075'}
4%|▍ | 18/478 [05:51<1:55:09, 15.02s/it] 4%|▍ | 19/478 [06:06<1:54:47, 15.00s/it] {'loss': '0.2617', 'grad_norm': '7.594', 'learning_rate': '7.66e-06', 'ppl': '1.299', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '18.8', 'tokens/total': 608000, 'tokens/trainable': 11584, 'epoch': '0.07917'}
4%|▍ | 19/478 [06:06<1:54:47, 15.00s/it] 4%|▍ | 20/478 [06:21<1:54:27, 14.99s/it] {'loss': '0.1699', 'grad_norm': '6.531', 'learning_rate': '8.085e-06', 'ppl': '1.185', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.31', 'tokens/total': 640000, 'tokens/trainable': 12221, 'epoch': '0.08333'}
4%|▍ | 20/478 [06:21<1:54:27, 14.99s/it] 4%|▍ | 21/478 [06:36<1:54:08, 14.99s/it] {'loss': '0.1992', 'grad_norm': '7.25', 'learning_rate': '8.511e-06', 'ppl': '1.22', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.05', 'tokens/total': 672000, 'tokens/trainable': 12880, 'epoch': '0.0875'}
4%|▍ | 21/478 [06:36<1:54:08, 14.99s/it] 5%|▍ | 22/478 [06:51<1:53:51, 14.98s/it] {'loss': '0.2231', 'grad_norm': '6.562', 'learning_rate': '8.936e-06', 'ppl': '1.25', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.91', 'tokens/total': 704000, 'tokens/trainable': 13505, 'epoch': '0.09167'}
5%|▍ | 22/478 [06:51<1:53:51, 14.98s/it] 5%|▍ | 23/478 [07:06<1:53:32, 14.97s/it] {'loss': '0.2583', 'grad_norm': '8.5', 'learning_rate': '9.362e-06', 'ppl': '1.295', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.84', 'tokens/total': 736000, 'tokens/trainable': 14187, 'epoch': '0.09583'}
5%|▍ | 23/478 [07:06<1:53:32, 14.97s/it] 5%|▌ | 24/478 [07:21<1:53:16, 14.97s/it] {'loss': '0.1807', 'grad_norm': '7.062', 'learning_rate': '9.787e-06', 'ppl': '1.198', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '16.49', 'tokens/total': 768000, 'tokens/trainable': 14680, 'epoch': '0.1'}
5%|▌ | 24/478 [07:21<1:53:16, 14.97s/it] 5%|▌ | 25/478 [07:36<1:53:01, 14.97s/it] {'loss': '0.2788', 'grad_norm': '6.625', 'learning_rate': '1.021e-05', 'ppl': '1.322', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.58', 'tokens/total': 800000, 'tokens/trainable': 15325, 'epoch': '0.1042'}
5%|▌ | 25/478 [07:36<1:53:01, 14.97s/it] 5%|▌ | 26/478 [07:51<1:52:46, 14.97s/it] {'loss': '0.1494', 'grad_norm': '4.75', 'learning_rate': '1.064e-05', 'ppl': '1.161', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '19.33', 'tokens/total': 832000, 'tokens/trainable': 15903, 'epoch': '0.1083'}
5%|▌ | 26/478 [07:51<1:52:46, 14.97s/it] 6%|▌ | 27/478 [08:06<1:52:31, 14.97s/it] {'loss': '0.1714', 'grad_norm': '5.812', 'learning_rate': '1.106e-05', 'ppl': '1.187', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.14', 'tokens/total': 864000, 'tokens/trainable': 16535, 'epoch': '0.1125'}
6%|▌ | 27/478 [08:06<1:52:31, 14.97s/it] 6%|▌ | 28/478 [08:21<1:52:13, 14.96s/it] {'loss': '0.2041', 'grad_norm': '6.062', 'learning_rate': '1.149e-05', 'ppl': '1.226', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.99', 'tokens/total': 896000, 'tokens/trainable': 17221, 'epoch': '0.1167'}
6%|▌ | 28/478 [08:21<1:52:13, 14.96s/it] 6%|▌ | 29/478 [08:36<1:51:59, 14.96s/it] {'loss': '0.1694', 'grad_norm': '4.969', 'learning_rate': '1.191e-05', 'ppl': '1.185', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.54', 'tokens/total': 928000, 'tokens/trainable': 17895, 'epoch': '0.1208'}
6%|▌ | 29/478 [08:36<1:51:59, 14.96s/it] 6%|▋ | 30/478 [08:51<1:51:45, 14.97s/it] {'loss': '0.1841', 'grad_norm': '5.438', 'learning_rate': '1.234e-05', 'ppl': '1.202', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '24.18', 'tokens/total': 960000, 'tokens/trainable': 18618, 'epoch': '0.125'}
6%|▋ | 30/478 [08:51<1:51:45, 14.97s/it] 6%|▋ | 31/478 [09:06<1:51:30, 14.97s/it] {'loss': '0.1812', 'grad_norm': '5.281', 'learning_rate': '1.277e-05', 'ppl': '1.199', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.14', 'tokens/total': 992000, 'tokens/trainable': 19250, 'epoch': '0.1292'}
6%|▋ | 31/478 [09:06<1:51:30, 14.97s/it] 7%|▋ | 32/478 [09:21<1:51:16, 14.97s/it] {'loss': '0.2358', 'grad_norm': '5.875', 'learning_rate': '1.319e-05', 'ppl': '1.266', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.54', 'tokens/total': 1024000, 'tokens/trainable': 19924, 'epoch': '0.1333'}
7%|▋ | 32/478 [09:21<1:51:16, 14.97s/it] 7%|▋ | 33/478 [09:36<1:51:01, 14.97s/it] {'loss': '0.1631', 'grad_norm': '5.312', 'learning_rate': '1.362e-05', 'ppl': '1.177', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.76', 'tokens/total': 1056000, 'tokens/trainable': 20545, 'epoch': '0.1375'}
7%|▋ | 33/478 [09:36<1:51:01, 14.97s/it] 7%|▋ | 34/478 [09:51<1:50:46, 14.97s/it] {'loss': '0.269', 'grad_norm': '6.281', 'learning_rate': '1.404e-05', 'ppl': '1.309', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.13', 'tokens/total': 1088000, 'tokens/trainable': 21147, 'epoch': '0.1417'}
7%|▋ | 34/478 [09:51<1:50:46, 14.97s/it] 7%|▋ | 35/478 [10:06<1:50:32, 14.97s/it] {'loss': '0.2339', 'grad_norm': '7', 'learning_rate': '1.447e-05', 'ppl': '1.264', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '23.01', 'tokens/total': 1120000, 'tokens/trainable': 21835, 'epoch': '0.1458'}
7%|▋ | 35/478 [10:06<1:50:32, 14.97s/it] 8%|▊ | 36/478 [10:21<1:50:17, 14.97s/it] {'loss': '0.1953', 'grad_norm': '5.5', 'learning_rate': '1.489e-05', 'ppl': '1.216', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.4', 'tokens/total': 1152000, 'tokens/trainable': 22505, 'epoch': '0.15'}
8%|▊ | 36/478 [10:21<1:50:17, 14.97s/it] 8%|▊ | 37/478 [10:36<1:50:02, 14.97s/it] {'loss': '0.1743', 'grad_norm': '5.344', 'learning_rate': '1.532e-05', 'ppl': '1.19', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '19.26', 'tokens/total': 1184000, 'tokens/trainable': 23081, 'epoch': '0.1542'}
8%|▊ | 37/478 [10:36<1:50:02, 14.97s/it] 8%|▊ | 38/478 [10:51<1:49:51, 14.98s/it] {'loss': '0.2637', 'grad_norm': '6.031', 'learning_rate': '1.574e-05', 'ppl': '1.302', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.96', 'tokens/total': 1216000, 'tokens/trainable': 23769, 'epoch': '0.1583'}
8%|▊ | 38/478 [10:51<1:49:51, 14.98s/it] 8%|▊ | 39/478 [11:06<1:49:32, 14.97s/it] {'loss': '0.1641', 'grad_norm': '4.688', 'learning_rate': '1.617e-05', 'ppl': '1.178', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '24.92', 'tokens/total': 1248000, 'tokens/trainable': 24513, 'epoch': '0.1625'}
8%|▊ | 39/478 [11:06<1:49:32, 14.97s/it] 8%|▊ | 40/478 [11:21<1:49:17, 14.97s/it] {'loss': '0.1709', 'grad_norm': '4.938', 'learning_rate': '1.66e-05', 'ppl': '1.186', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.32', 'tokens/total': 1280000, 'tokens/trainable': 25121, 'epoch': '0.1667'}
8%|▊ | 40/478 [11:21<1:49:17, 14.97s/it] 9%|▊ | 41/478 [11:36<1:49:02, 14.97s/it] {'loss': '0.1924', 'grad_norm': '5.562', 'learning_rate': '1.702e-05', 'ppl': '1.212', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.4', 'tokens/total': 1312000, 'tokens/trainable': 25761, 'epoch': '0.1708'}
9%|▊ | 41/478 [11:36<1:49:02, 14.97s/it] 9%|▉ | 42/478 [11:51<1:48:46, 14.97s/it] {'loss': '0.1841', 'grad_norm': '4.969', 'learning_rate': '1.745e-05', 'ppl': '1.202', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '24.32', 'tokens/total': 1344000, 'tokens/trainable': 26488, 'epoch': '0.175'}
9%|▉ | 42/478 [11:51<1:48:46, 14.97s/it] 9%|▉ | 43/478 [12:06<1:48:28, 14.96s/it] {'loss': '0.2368', 'grad_norm': '5.562', 'learning_rate': '1.787e-05', 'ppl': '1.267', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.31', 'tokens/total': 1376000, 'tokens/trainable': 27154, 'epoch': '0.1792'}
9%|▉ | 43/478 [12:06<1:48:28, 14.96s/it] 9%|▉ | 44/478 [12:21<1:48:14, 14.96s/it] {'loss': '0.2148', 'grad_norm': '5.625', 'learning_rate': '1.83e-05', 'ppl': '1.24', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.67', 'tokens/total': 1408000, 'tokens/trainable': 27802, 'epoch': '0.1833'}
9%|▉ | 44/478 [12:21<1:48:14, 14.96s/it] 9%|▉ | 45/478 [12:35<1:48:00, 14.97s/it] {'loss': '0.228', 'grad_norm': '5.906', 'learning_rate': '1.872e-05', 'ppl': '1.256', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.5', 'tokens/total': 1440000, 'tokens/trainable': 28475, 'epoch': '0.1875'}
9%|▉ | 45/478 [12:36<1:48:00, 14.97s/it] 10%|▉ | 46/478 [12:50<1:47:45, 14.97s/it] {'loss': '0.1973', 'grad_norm': '5.75', 'learning_rate': '1.915e-05', 'ppl': '1.218', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.88', 'tokens/total': 1472000, 'tokens/trainable': 29099, 'epoch': '0.1917'}
10%|▉ | 46/478 [12:50<1:47:45, 14.97s/it] 10%|▉ | 47/478 [13:05<1:47:29, 14.97s/it] {'loss': '0.1963', 'grad_norm': '5.312', 'learning_rate': '1.957e-05', 'ppl': '1.217', 'memory/max_active (GiB)': '45.17', 'memory/max_allocated (GiB)': '45.17', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.48', 'tokens/total': 1504000, 'tokens/trainable': 29771, 'epoch': '0.1958'}
10%|▉ | 47/478 [13:05<1:47:29, 14.97s/it] 10%|█ | 48/478 [13:20<1:47:15, 14.97s/it] {'loss': '0.2627', 'grad_norm': '7.156', 'learning_rate': '2e-05', 'ppl': '1.3', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '18.63', 'tokens/total': 1536000, 'tokens/trainable': 30328, 'epoch': '0.2'}
10%|█ | 48/478 [13:20<1:47:15, 14.97s/it] 10%|█ | 49/478 [13:35<1:47:00, 14.97s/it] {'loss': '0.2505', 'grad_norm': '7.438', 'learning_rate': '2e-05', 'ppl': '1.285', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.57', 'tokens/total': 1568000, 'tokens/trainable': 30943, 'epoch': '0.2042'}
10%|█ | 49/478 [13:35<1:47:00, 14.97s/it] 10%|█ | 50/478 [13:50<1:46:44, 14.96s/it] {'loss': '0.2026', 'grad_norm': '6.75', 'learning_rate': '2e-05', 'ppl': '1.225', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '18.94', 'tokens/total': 1600000, 'tokens/trainable': 31509, 'epoch': '0.2083'}
10%|█ | 50/478 [13:50<1:46:44, 14.96s/it] 11%|█ | 51/478 [14:05<1:46:29, 14.96s/it] {'loss': '0.2368', 'grad_norm': '6.281', 'learning_rate': '2e-05', 'ppl': '1.267', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '18.24', 'tokens/total': 1632000, 'tokens/trainable': 32054, 'epoch': '0.2125'}
11%|█ | 51/478 [14:05<1:46:29, 14.96s/it] 11%|█ | 52/478 [14:20<1:46:06, 14.94s/it] {'loss': '0.2344', 'grad_norm': '6.281', 'learning_rate': '2e-05', 'ppl': '1.264', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '19.65', 'tokens/total': 1664000, 'tokens/trainable': 32639, 'epoch': '0.2167'}
11%|█ | 52/478 [14:20<1:46:06, 14.94s/it] 11%|█ | 53/478 [14:35<1:45:54, 14.95s/it] {'loss': '0.2012', 'grad_norm': '5.188', 'learning_rate': '1.999e-05', 'ppl': '1.223', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.51', 'tokens/total': 1696000, 'tokens/trainable': 33282, 'epoch': '0.2208'}
11%|█ | 53/478 [14:35<1:45:54, 14.95s/it] 11%|█▏ | 54/478 [14:50<1:45:37, 14.95s/it] {'loss': '0.2617', 'grad_norm': '5.656', 'learning_rate': '1.999e-05', 'ppl': '1.299', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.75', 'tokens/total': 1728000, 'tokens/trainable': 33931, 'epoch': '0.225'}
11%|█▏ | 54/478 [14:50<1:45:37, 14.95s/it] 12%|█▏ | 55/478 [15:05<1:45:22, 14.95s/it] {'loss': '0.207', 'grad_norm': '5.312', 'learning_rate': '1.999e-05', 'ppl': '1.23', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '23.32', 'tokens/total': 1760000, 'tokens/trainable': 34627, 'epoch': '0.2292'}
12%|█▏ | 55/478 [15:05<1:45:22, 14.95s/it] 12%|█▏ | 56/478 [15:20<1:45:09, 14.95s/it] {'loss': '0.1846', 'grad_norm': '5.812', 'learning_rate': '1.998e-05', 'ppl': '1.203', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '18.67', 'tokens/total': 1792000, 'tokens/trainable': 35185, 'epoch': '0.2333'}
12%|█▏ | 56/478 [15:20<1:45:09, 14.95s/it] 12%|█▏ | 57/478 [15:35<1:44:53, 14.95s/it] {'loss': '0.207', 'grad_norm': '5.594', 'learning_rate': '1.998e-05', 'ppl': '1.23', 'memory/max_active (GiB)': '45.17', 'memory/max_allocated (GiB)': '45.17', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.99', 'tokens/total': 1824000, 'tokens/trainable': 35871, 'epoch': '0.2375'}
12%|█▏ | 57/478 [15:35<1:44:53, 14.95s/it] 12%|█▏ | 58/478 [15:50<1:44:40, 14.95s/it] {'loss': '0.2261', 'grad_norm': '6', 'learning_rate': '1.997e-05', 'ppl': '1.254', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '18.53', 'tokens/total': 1856000, 'tokens/trainable': 36425, 'epoch': '0.2417'}
12%|█▏ | 58/478 [15:50<1:44:40, 14.95s/it] 12%|█▏ | 59/478 [16:05<1:44:27, 14.96s/it] {'loss': '0.1909', 'grad_norm': '6.75', 'learning_rate': '1.997e-05', 'ppl': '1.21', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.18', 'tokens/total': 1888000, 'tokens/trainable': 37028, 'epoch': '0.2458'}
12%|█▏ | 59/478 [16:05<1:44:27, 14.96s/it] 13%|█▎ | 60/478 [16:20<1:44:13, 14.96s/it] {'loss': '0.2793', 'grad_norm': '7.062', 'learning_rate': '1.996e-05', 'ppl': '1.322', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.88', 'tokens/total': 1920000, 'tokens/trainable': 37712, 'epoch': '0.25'}
13%|█▎ | 60/478 [16:20<1:44:13, 14.96s/it] 13%|█▎ | 61/478 [16:35<1:43:51, 14.94s/it] {'loss': '0.2065', 'grad_norm': '5.125', 'learning_rate': '1.996e-05', 'ppl': '1.229', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '23.98', 'tokens/total': 1952000, 'tokens/trainable': 38426, 'epoch': '0.2542'}
13%|█▎ | 61/478 [16:35<1:43:51, 14.94s/it] 13%|█▎ | 62/478 [16:50<1:43:40, 14.95s/it] {'loss': '0.1821', 'grad_norm': '5.156', 'learning_rate': '1.995e-05', 'ppl': '1.2', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.67', 'tokens/total': 1984000, 'tokens/trainable': 39044, 'epoch': '0.2583'}
13%|█▎ | 62/478 [16:50<1:43:40, 14.95s/it] 13%|█▎ | 63/478 [17:05<1:43:26, 14.96s/it] {'loss': '0.2495', 'grad_norm': '6.406', 'learning_rate': '1.994e-05', 'ppl': '1.283', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.31', 'tokens/total': 2016000, 'tokens/trainable': 39651, 'epoch': '0.2625'}
13%|█▎ | 63/478 [17:05<1:43:26, 14.96s/it] 13%|█▎ | 64/478 [17:20<1:43:12, 14.96s/it] {'loss': '0.2432', 'grad_norm': '5.375', 'learning_rate': '1.993e-05', 'ppl': '1.275', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.92', 'tokens/total': 2048000, 'tokens/trainable': 40336, 'epoch': '0.2667'}
13%|█▎ | 64/478 [17:20<1:43:12, 14.96s/it] 14%|█▎ | 65/478 [17:35<1:42:58, 14.96s/it] {'loss': '0.1392', 'grad_norm': '4.469', 'learning_rate': '1.992e-05', 'ppl': '1.149', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '18.03', 'tokens/total': 2080000, 'tokens/trainable': 40875, 'epoch': '0.2708'}
14%|█▎ | 65/478 [17:35<1:42:58, 14.96s/it] 14%|█▍ | 66/478 [17:50<1:42:45, 14.96s/it] {'loss': '0.2319', 'grad_norm': '5.562', 'learning_rate': '1.991e-05', 'ppl': '1.261', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.24', 'tokens/total': 2112000, 'tokens/trainable': 41510, 'epoch': '0.275'}
14%|█▍ | 66/478 [17:50<1:42:45, 14.96s/it] 14%|█▍ | 67/478 [18:05<1:42:31, 14.97s/it] {'loss': '0.189', 'grad_norm': '4.938', 'learning_rate': '1.99e-05', 'ppl': '1.208', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.07', 'tokens/total': 2144000, 'tokens/trainable': 42170, 'epoch': '0.2792'}
14%|█▍ | 67/478 [18:05<1:42:31, 14.97s/it] 14%|█▍ | 68/478 [18:20<1:42:16, 14.97s/it] {'loss': '0.2231', 'grad_norm': '7.812', 'learning_rate': '1.989e-05', 'ppl': '1.25', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '16.62', 'tokens/total': 2176000, 'tokens/trainable': 42667, 'epoch': '0.2833'}
14%|█▍ | 68/478 [18:20<1:42:16, 14.97s/it] 14%|█▍ | 69/478 [18:34<1:42:01, 14.97s/it] {'loss': '0.1948', 'grad_norm': '5.219', 'learning_rate': '1.988e-05', 'ppl': '1.215', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '17.96', 'tokens/total': 2208000, 'tokens/trainable': 43204, 'epoch': '0.2875'}
14%|█▍ | 69/478 [18:34<1:42:01, 14.97s/it] 15%|█▍ | 70/478 [18:49<1:41:46, 14.97s/it] {'loss': '0.146', 'grad_norm': '4.156', 'learning_rate': '1.987e-05', 'ppl': '1.157', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.14', 'tokens/total': 2240000, 'tokens/trainable': 43866, 'epoch': '0.2917'}
15%|█▍ | 70/478 [18:49<1:41:46, 14.97s/it] 15%|█▍ | 71/478 [19:04<1:41:31, 14.97s/it] {'loss': '0.2588', 'grad_norm': '5.969', 'learning_rate': '1.986e-05', 'ppl': '1.295', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.85', 'tokens/total': 2272000, 'tokens/trainable': 44549, 'epoch': '0.2958'}
15%|█▍ | 71/478 [19:04<1:41:31, 14.97s/it] 15%|█▌ | 72/478 [19:19<1:41:16, 14.97s/it] {'loss': '0.1719', 'grad_norm': '4.938', 'learning_rate': '1.985e-05', 'ppl': '1.188', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.27', 'tokens/total': 2304000, 'tokens/trainable': 45155, 'epoch': '0.3'}
15%|█▌ | 72/478 [19:19<1:41:16, 14.97s/it] 15%|█▌ | 73/478 [19:34<1:41:02, 14.97s/it] {'loss': '0.1885', 'grad_norm': '5.781', 'learning_rate': '1.983e-05', 'ppl': '1.207', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '19.49', 'tokens/total': 2336000, 'tokens/trainable': 45738, 'epoch': '0.3042'}
15%|█▌ | 73/478 [19:34<1:41:02, 14.97s/it] 15%|█▌ | 74/478 [19:49<1:40:43, 14.96s/it] {'loss': '0.2168', 'grad_norm': '6.219', 'learning_rate': '1.982e-05', 'ppl': '1.242', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.08', 'tokens/total': 2368000, 'tokens/trainable': 46367, 'epoch': '0.3083'}
15%|█▌ | 74/478 [19:49<1:40:43, 14.96s/it] 16%|█▌ | 75/478 [20:04<1:40:29, 14.96s/it] {'loss': '0.1719', 'grad_norm': '5.375', 'learning_rate': '1.981e-05', 'ppl': '1.188', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '19.66', 'tokens/total': 2400000, 'tokens/trainable': 46955, 'epoch': '0.3125'}
16%|█▌ | 75/478 [20:04<1:40:29, 14.96s/it] 16%|█▌ | 76/478 [20:19<1:40:16, 14.97s/it] {'loss': '0.2334', 'grad_norm': '6.188', 'learning_rate': '1.979e-05', 'ppl': '1.263', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.3', 'tokens/total': 2432000, 'tokens/trainable': 47562, 'epoch': '0.3167'}
16%|█▌ | 76/478 [20:19<1:40:16, 14.97s/it] 16%|█▌ | 77/478 [20:34<1:40:01, 14.97s/it] {'loss': '0.3389', 'grad_norm': '7.906', 'learning_rate': '1.978e-05', 'ppl': '1.403', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '19', 'tokens/total': 2464000, 'tokens/trainable': 48130, 'epoch': '0.3208'}
16%|█▌ | 77/478 [20:34<1:40:01, 14.97s/it] 16%|█▋ | 78/478 [20:49<1:39:50, 14.98s/it] {'loss': '0.1982', 'grad_norm': '4.531', 'learning_rate': '1.976e-05', 'ppl': '1.219', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.93', 'tokens/total': 2496000, 'tokens/trainable': 48787, 'epoch': '0.325'}
16%|█▋ | 78/478 [20:49<1:39:50, 14.98s/it] 17%|█▋ | 79/478 [21:04<1:39:34, 14.97s/it] {'loss': '0.2163', 'grad_norm': '5.594', 'learning_rate': '1.975e-05', 'ppl': '1.241', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21', 'tokens/total': 2528000, 'tokens/trainable': 49415, 'epoch': '0.3292'}
17%|█▋ | 79/478 [21:04<1:39:34, 14.97s/it] 17%|█▋ | 80/478 [21:19<1:39:15, 14.96s/it] {'loss': '0.1899', 'grad_norm': '4.906', 'learning_rate': '1.973e-05', 'ppl': '1.209', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.99', 'tokens/total': 2560000, 'tokens/trainable': 50071, 'epoch': '0.3333'}
17%|█▋ | 80/478 [21:19<1:39:15, 14.96s/it] 17%|█▋ | 81/478 [21:34<1:38:58, 14.96s/it] {'loss': '0.2627', 'grad_norm': '6.5', 'learning_rate': '1.971e-05', 'ppl': '1.3', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.61', 'tokens/total': 2592000, 'tokens/trainable': 50746, 'epoch': '0.3375'}
17%|█▋ | 81/478 [21:34<1:38:58, 14.96s/it] 17%|█▋ | 82/478 [21:49<1:38:46, 14.96s/it] {'loss': '0.1797', 'grad_norm': '5.031', 'learning_rate': '1.969e-05', 'ppl': '1.197', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.52', 'tokens/total': 2624000, 'tokens/trainable': 51360, 'epoch': '0.3417'}
17%|█▋ | 82/478 [21:49<1:38:46, 14.96s/it] 17%|█▋ | 83/478 [22:04<1:38:30, 14.96s/it] {'loss': '0.1689', 'grad_norm': '7.531', 'learning_rate': '1.968e-05', 'ppl': '1.184', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '16.22', 'tokens/total': 2656000, 'tokens/trainable': 51845, 'epoch': '0.3458'}
17%|█▋ | 83/478 [22:04<1:38:30, 14.96s/it] 18%|█▊ | 84/478 [22:19<1:38:15, 14.96s/it] {'loss': '0.1943', 'grad_norm': '5.688', 'learning_rate': '1.966e-05', 'ppl': '1.214', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.68', 'tokens/total': 2688000, 'tokens/trainable': 52523, 'epoch': '0.35'}
18%|█▊ | 84/478 [22:19<1:38:15, 14.96s/it] 18%|█▊ | 85/478 [22:34<1:38:00, 14.96s/it] {'loss': '0.1772', 'grad_norm': '5.344', 'learning_rate': '1.964e-05', 'ppl': '1.194', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '19.51', 'tokens/total': 2720000, 'tokens/trainable': 53106, 'epoch': '0.3542'}
18%|█▊ | 85/478 [22:34<1:38:00, 14.96s/it] 18%|█▊ | 86/478 [22:49<1:37:47, 14.97s/it] {'loss': '0.2812', 'grad_norm': '7.031', 'learning_rate': '1.962e-05', 'ppl': '1.325', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '17.42', 'tokens/total': 2752000, 'tokens/trainable': 53627, 'epoch': '0.3583'}
18%|█▊ | 86/478 [22:49<1:37:47, 14.97s/it] 18%|█▊ | 87/478 [23:04<1:37:32, 14.97s/it] {'loss': '0.1904', 'grad_norm': '5.344', 'learning_rate': '1.96e-05', 'ppl': '1.21', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.8', 'tokens/total': 2784000, 'tokens/trainable': 54249, 'epoch': '0.3625'}
18%|█▊ | 87/478 [23:04<1:37:32, 14.97s/it] 18%|█▊ | 88/478 [23:19<1:37:16, 14.97s/it] {'loss': '0.2026', 'grad_norm': '5.344', 'learning_rate': '1.958e-05', 'ppl': '1.225', 'memory/max_active (GiB)': '45.17', 'memory/max_allocated (GiB)': '45.17', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '23.22', 'tokens/total': 2816000, 'tokens/trainable': 54943, 'epoch': '0.3667'}
18%|█▊ | 88/478 [23:19<1:37:16, 14.97s/it] 19%|█▊ | 89/478 [23:34<1:37:02, 14.97s/it] {'loss': '0.23', 'grad_norm': '7.156', 'learning_rate': '1.956e-05', 'ppl': '1.259', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.03', 'tokens/total': 2848000, 'tokens/trainable': 55572, 'epoch': '0.3708'}
19%|█▊ | 89/478 [23:34<1:37:02, 14.97s/it] 19%|█▉ | 90/478 [23:49<1:36:47, 14.97s/it] {'loss': '0.2236', 'grad_norm': '5.625', 'learning_rate': '1.954e-05', 'ppl': '1.251', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '23.91', 'tokens/total': 2880000, 'tokens/trainable': 56287, 'epoch': '0.375'}
19%|█▉ | 90/478 [23:49<1:36:47, 14.97s/it] 19%|█▉ | 91/478 [24:04<1:36:32, 14.97s/it] {'loss': '0.2427', 'grad_norm': '6.5', 'learning_rate': '1.951e-05', 'ppl': '1.275', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.5', 'tokens/total': 2912000, 'tokens/trainable': 56900, 'epoch': '0.3792'}
19%|█▉ | 91/478 [24:04<1:36:32, 14.97s/it] 19%|█▉ | 92/478 [24:19<1:36:17, 14.97s/it] {'loss': '0.2754', 'grad_norm': '6.562', 'learning_rate': '1.949e-05', 'ppl': '1.317', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '18.6', 'tokens/total': 2944000, 'tokens/trainable': 57456, 'epoch': '0.3833'}
19%|█▉ | 92/478 [24:19<1:36:17, 14.97s/it] 19%|█▉ | 93/478 [24:34<1:36:02, 14.97s/it] {'loss': '0.2202', 'grad_norm': '5.438', 'learning_rate': '1.947e-05', 'ppl': '1.246', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.98', 'tokens/total': 2976000, 'tokens/trainable': 58113, 'epoch': '0.3875'}
19%|█▉ | 93/478 [24:34<1:36:02, 14.97s/it] 20%|█▉ | 94/478 [24:49<1:35:46, 14.97s/it] {'loss': '0.1865', 'grad_norm': '5.844', 'learning_rate': '1.944e-05', 'ppl': '1.205', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '17.47', 'tokens/total': 3008000, 'tokens/trainable': 58635, 'epoch': '0.3917'}
20%|█▉ | 94/478 [24:49<1:35:46, 14.97s/it] 20%|█▉ | 95/478 [25:04<1:35:33, 14.97s/it] {'loss': '0.2656', 'grad_norm': '6.562', 'learning_rate': '1.942e-05', 'ppl': '1.304', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '16.09', 'tokens/total': 3040000, 'tokens/trainable': 59116, 'epoch': '0.3958'}
20%|█▉ | 95/478 [25:04<1:35:33, 14.97s/it] 20%|██ | 96/478 [25:19<1:35:18, 14.97s/it] {'loss': '0.1924', 'grad_norm': '6.875', 'learning_rate': '1.939e-05', 'ppl': '1.212', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '16.69', 'tokens/total': 3072000, 'tokens/trainable': 59615, 'epoch': '0.4'}
20%|██ | 96/478 [25:19<1:35:18, 14.97s/it] 20%|██ | 97/478 [25:34<1:35:04, 14.97s/it] {'loss': '0.2271', 'grad_norm': '7.094', 'learning_rate': '1.937e-05', 'ppl': '1.255', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.66', 'tokens/total': 3104000, 'tokens/trainable': 60233, 'epoch': '0.4042'}
20%|██ | 97/478 [25:34<1:35:04, 14.97s/it] 21%|██ | 98/478 [25:49<1:34:49, 14.97s/it] {'loss': '0.2163', 'grad_norm': '5.062', 'learning_rate': '1.934e-05', 'ppl': '1.241', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.87', 'tokens/total': 3136000, 'tokens/trainable': 60857, 'epoch': '0.4083'}
21%|██ | 98/478 [25:49<1:34:49, 14.97s/it] 21%|██ | 99/478 [26:03<1:34:33, 14.97s/it] {'loss': '0.1621', 'grad_norm': '4.938', 'learning_rate': '1.932e-05', 'ppl': '1.176', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.88', 'tokens/total': 3168000, 'tokens/trainable': 61541, 'epoch': '0.4125'}
21%|██ | 99/478 [26:04<1:34:33, 14.97s/it] 21%|██ | 100/478 [26:18<1:34:18, 14.97s/it] {'loss': '0.2051', 'grad_norm': '5.219', 'learning_rate': '1.929e-05', 'ppl': '1.228', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.57', 'tokens/total': 3200000, 'tokens/trainable': 62156, 'epoch': '0.4167'}
21%|██ | 100/478 [26:18<1:34:18, 14.97s/it] 21%|██ | 101/478 [26:33<1:34:02, 14.97s/it] {'loss': '0.2344', 'grad_norm': '6.625', 'learning_rate': '1.926e-05', 'ppl': '1.264', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '18.6', 'tokens/total': 3232000, 'tokens/trainable': 62712, 'epoch': '0.4208'}
21%|██ | 101/478 [26:33<1:34:02, 14.97s/it] 21%|██▏ | 102/478 [26:48<1:33:46, 14.96s/it] {'loss': '0.1924', 'grad_norm': '5.812', 'learning_rate': '1.924e-05', 'ppl': '1.212', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.25', 'tokens/total': 3264000, 'tokens/trainable': 63347, 'epoch': '0.425'}
21%|██▏ | 102/478 [26:48<1:33:46, 14.96s/it] 22%|██▏ | 103/478 [27:03<1:33:31, 14.96s/it] {'loss': '0.1865', 'grad_norm': '5', 'learning_rate': '1.921e-05', 'ppl': '1.205', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '19.4', 'tokens/total': 3296000, 'tokens/trainable': 63927, 'epoch': '0.4292'}
22%|██▏ | 103/478 [27:03<1:33:31, 14.96s/it] 22%|██▏ | 104/478 [27:18<1:33:17, 14.97s/it] {'loss': '0.2471', 'grad_norm': '5.438', 'learning_rate': '1.918e-05', 'ppl': '1.28', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.87', 'tokens/total': 3328000, 'tokens/trainable': 64551, 'epoch': '0.4333'}
22%|██▏ | 104/478 [27:18<1:33:17, 14.97s/it] 22%|██▏ | 105/478 [27:33<1:33:02, 14.97s/it] {'loss': '0.1685', 'grad_norm': '5.125', 'learning_rate': '1.915e-05', 'ppl': '1.183', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.44', 'tokens/total': 3360000, 'tokens/trainable': 65192, 'epoch': '0.4375'}
22%|██▏ | 105/478 [27:33<1:33:02, 14.97s/it] 22%|██▏ | 106/478 [27:48<1:32:47, 14.97s/it] {'loss': '0.1958', 'grad_norm': '5.219', 'learning_rate': '1.912e-05', 'ppl': '1.216', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '23.29', 'tokens/total': 3392000, 'tokens/trainable': 65888, 'epoch': '0.4417'}
22%|██▏ | 106/478 [27:48<1:32:47, 14.97s/it] 22%|██▏ | 107/478 [28:03<1:32:32, 14.97s/it] {'loss': '0.2749', 'grad_norm': '6.094', 'learning_rate': '1.909e-05', 'ppl': '1.316', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.91', 'tokens/total': 3424000, 'tokens/trainable': 66513, 'epoch': '0.4458'}
22%|██▏ | 107/478 [28:03<1:32:32, 14.97s/it] 23%|██▎ | 108/478 [28:18<1:32:17, 14.97s/it] {'loss': '0.1978', 'grad_norm': '4.844', 'learning_rate': '1.906e-05', 'ppl': '1.219', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.4', 'tokens/total': 3456000, 'tokens/trainable': 67123, 'epoch': '0.45'}
23%|██▎ | 108/478 [28:18<1:32:17, 14.97s/it] 23%|██▎ | 109/478 [28:33<1:32:02, 14.97s/it] {'loss': '0.2812', 'grad_norm': '6.406', 'learning_rate': '1.903e-05', 'ppl': '1.325', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '18.87', 'tokens/total': 3488000, 'tokens/trainable': 67687, 'epoch': '0.4542'}
23%|██▎ | 109/478 [28:33<1:32:02, 14.97s/it] 23%|██▎ | 110/478 [28:48<1:31:47, 14.97s/it] {'loss': '0.2578', 'grad_norm': '6.062', 'learning_rate': '1.9e-05', 'ppl': '1.294', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.61', 'tokens/total': 3520000, 'tokens/trainable': 68333, 'epoch': '0.4583'}
23%|██▎ | 110/478 [28:48<1:31:47, 14.97s/it] 23%|██▎ | 111/478 [29:03<1:31:39, 14.99s/it] {'loss': '0.1865', 'grad_norm': '4.438', 'learning_rate': '1.896e-05', 'ppl': '1.205', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.88', 'tokens/total': 3552000, 'tokens/trainable': 68990, 'epoch': '0.4625'}
23%|██▎ | 111/478 [29:03<1:31:39, 14.99s/it] 23%|██▎ | 112/478 [29:18<1:31:22, 14.98s/it] {'loss': '0.2842', 'grad_norm': '6.375', 'learning_rate': '1.893e-05', 'ppl': '1.329', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '18.87', 'tokens/total': 3584000, 'tokens/trainable': 69554, 'epoch': '0.4667'}
23%|██▎ | 112/478 [29:18<1:31:22, 14.98s/it] 24%|██▎ | 113/478 [29:33<1:30:58, 14.96s/it] {'loss': '0.29', 'grad_norm': '7.375', 'learning_rate': '1.89e-05', 'ppl': '1.336', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.7', 'tokens/total': 3616000, 'tokens/trainable': 70200, 'epoch': '0.4708'}
24%|██▎ | 113/478 [29:33<1:30:58, 14.96s/it] 24%|██▍ | 114/478 [29:48<1:30:44, 14.96s/it] {'loss': '0.1782', 'grad_norm': '9.062', 'learning_rate': '1.886e-05', 'ppl': '1.195', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.59', 'tokens/total': 3648000, 'tokens/trainable': 70875, 'epoch': '0.475'}
24%|██▍ | 114/478 [29:48<1:30:44, 14.96s/it] 24%|██▍ | 115/478 [30:03<1:30:30, 14.96s/it] {'loss': '0.2656', 'grad_norm': '5.594', 'learning_rate': '1.883e-05', 'ppl': '1.304', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '20.14', 'tokens/total': 3680000, 'tokens/trainable': 71477, 'epoch': '0.4792'}
24%|██▍ | 115/478 [30:03<1:30:30, 14.96s/it] 24%|██▍ | 116/478 [30:18<1:30:16, 14.96s/it] {'loss': '0.2251', 'grad_norm': '4.688', 'learning_rate': '1.88e-05', 'ppl': '1.252', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '18.03', 'tokens/total': 3712000, 'tokens/trainable': 72016, 'epoch': '0.4833'}
24%|██▍ | 116/478 [30:18<1:30:16, 14.96s/it] 24%|██▍ | 117/478 [30:33<1:30:01, 14.96s/it] {'loss': '0.2319', 'grad_norm': '5.438', 'learning_rate': '1.876e-05', 'ppl': '1.261', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.68', 'tokens/total': 3744000, 'tokens/trainable': 72664, 'epoch': '0.4875'}
24%|██▍ | 117/478 [30:33<1:30:01, 14.96s/it] 25%|██▍ | 118/478 [30:48<1:29:49, 14.97s/it] {'loss': '0.1895', 'grad_norm': '5.281', 'learning_rate': '1.873e-05', 'ppl': '1.209', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '21.71', 'tokens/total': 3776000, 'tokens/trainable': 73314, 'epoch': '0.4917'}
25%|██▍ | 118/478 [30:48<1:29:49, 14.97s/it] 25%|██▍ | 119/478 [31:03<1:29:33, 14.97s/it] {'loss': '0.1973', 'grad_norm': '6.719', 'learning_rate': '1.869e-05', 'ppl': '1.218', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.98', 'tokens/total': 3808000, 'tokens/trainable': 74001, 'epoch': '0.4958'}
25%|██▍ | 119/478 [31:03<1:29:33, 14.97s/it] 25%|██▌ | 120/478 [31:18<1:29:18, 14.97s/it] {'loss': '0.2339', 'grad_norm': '4.969', 'learning_rate': '1.865e-05', 'ppl': '1.264', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '59.04', 'tokens/train_per_sec_per_gpu': '22.25', 'tokens/total': 3840000, 'tokens/trainable': 74666, 'epoch': '0.5'}
25%|██▌ | 120/478 [31:18<1:29:18, 14.97s/it][2026-04-17 02:40:42,441] [INFO] [axolotl.core.trainers.base] Running evaluation step...
[2026-04-17 02:40:50,590] [INFO] [axolotl.utils.samplers.multipack] gather_len_batches: [54, 54]
0%| | 0/27 [00:00<?, ?it/s]
7%|▋ | 2/27 [00:02<00:35, 1.41s/it]
11%|█ | 3/27 [00:05<00:47, 1.97s/it]
15%|█▍ | 4/27 [00:08<00:51, 2.26s/it]
19%|█▊ | 5/27 [00:11<00:53, 2.43s/it]
22%|██▏ | 6/27 [00:13<00:53, 2.54s/it]
26%|██▌ | 7/27 [00:16<00:52, 2.61s/it]
30%|██▉ | 8/27 [00:19<00:50, 2.65s/it]
33%|███▎ | 9/27 [00:22<00:48, 2.68s/it]
37%|███▋ | 10/27 [00:24<00:45, 2.71s/it]
41%|████ | 11/27 [00:27<00:43, 2.72s/it]
44%|████▍ | 12/27 [00:30<00:39, 2.64s/it]
48%|████▊ | 13/27 [00:33<00:38, 2.76s/it]
52%|█████▏ | 14/27 [00:35<00:35, 2.76s/it]
56%|█████▌ | 15/27 [00:38<00:33, 2.76s/it]
59%|█████▉ | 16/27 [00:41<00:30, 2.76s/it]
63%|██████▎ | 17/27 [00:44<00:27, 2.76s/it]
67%|██████▋ | 18/27 [00:46<00:24, 2.76s/it]
70%|███████ | 19/27 [00:49<00:22, 2.76s/it]
74%|███████▍ | 20/27 [00:52<00:19, 2.75s/it]
78%|███████▊ | 21/27 [00:54<00:16, 2.67s/it]
81%|████████▏ | 22/27 [00:57<00:13, 2.78s/it]
85%|████████▌ | 23/27 [01:00<00:11, 2.77s/it]
89%|████████▉ | 24/27 [01:03<00:08, 2.77s/it]
93%|█████████▎| 25/27 [01:06<00:05, 2.76s/it]
96%|█████████▋| 26/27 [01:08<00:02, 2.76s/it]
100%|██████████| 27/27 [01:11<00:00, 2.79s/it]
{'eval_loss': '0.2288', 'eval_runtime': '75.12', 'eval_samples_per_second': '2.782', 'eval_steps_per_second': '1.398', 'eval_ppl': '1.257', 'memory/max_active (GiB)': '34.91', 'memory/max_allocated (GiB)': '34.91', 'memory/device_reserved (GiB)': '59.04', 'epoch': '0.5', 'tokens/train_per_sec_per_gpu': '0'}
25%|██▌ | 120/478 [32:41<1:29:18, 14.97s/it]
100%|██████████| 27/27 [01:13<00:00, 2.79s/it]
25%|██▌ | 121/478 [32:56<3:57:42, 39.95s/it] {'loss': '0.2651', 'grad_norm': '6.125', 'learning_rate': '1.862e-05', 'ppl': '1.304', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '17.61', 'tokens/total': 3872000, 'tokens/trainable': 75192, 'epoch': '0.5042'}
25%|██▌ | 121/478 [32:56<3:57:42, 39.95s/it] 26%|██▌ | 122/478 [33:11<3:12:32, 32.45s/it] {'loss': '0.2109', 'grad_norm': '5.812', 'learning_rate': '1.858e-05', 'ppl': '1.235', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.68', 'tokens/total': 3904000, 'tokens/trainable': 75780, 'epoch': '0.5083'}
26%|██▌ | 122/478 [33:11<3:12:32, 32.45s/it] 26%|██▌ | 123/478 [33:26<2:40:56, 27.20s/it] {'loss': '0.2168', 'grad_norm': '5.438', 'learning_rate': '1.854e-05', 'ppl': '1.242', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '17.31', 'tokens/total': 3936000, 'tokens/trainable': 76297, 'epoch': '0.5125'}
26%|██▌ | 123/478 [33:26<2:40:56, 27.20s/it] 26%|██▌ | 124/478 [33:41<2:18:48, 23.53s/it] {'loss': '0.2153', 'grad_norm': '5.344', 'learning_rate': '1.85e-05', 'ppl': '1.24', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '17.41', 'tokens/total': 3968000, 'tokens/trainable': 76817, 'epoch': '0.5167'}
26%|██▌ | 124/478 [33:41<2:18:48, 23.53s/it] 26%|██▌ | 125/478 [33:56<2:03:16, 20.95s/it] {'loss': '0.2075', 'grad_norm': '8.938', 'learning_rate': '1.847e-05', 'ppl': '1.231', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.12', 'tokens/total': 4000000, 'tokens/trainable': 77418, 'epoch': '0.5208'}
26%|██▌ | 125/478 [33:56<2:03:16, 20.95s/it] 26%|██▋ | 126/478 [34:11<1:52:21, 19.15s/it] {'loss': '0.2148', 'grad_norm': '5.344', 'learning_rate': '1.843e-05', 'ppl': '1.24', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.9', 'tokens/total': 4032000, 'tokens/trainable': 78072, 'epoch': '0.525'}
26%|██▋ | 126/478 [34:11<1:52:21, 19.15s/it] 27%|██▋ | 127/478 [34:26<1:44:40, 17.89s/it] {'loss': '0.2383', 'grad_norm': '5.594', 'learning_rate': '1.839e-05', 'ppl': '1.269', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.32', 'tokens/total': 4064000, 'tokens/trainable': 78709, 'epoch': '0.5292'}
27%|██▋ | 127/478 [34:26<1:44:40, 17.89s/it] 27%|██▋ | 128/478 [34:41<1:39:13, 17.01s/it] {'loss': '0.2544', 'grad_norm': '5.969', 'learning_rate': '1.835e-05', 'ppl': '1.29', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.15', 'tokens/total': 4096000, 'tokens/trainable': 79251, 'epoch': '0.5333'}
27%|██▋ | 128/478 [34:41<1:39:13, 17.01s/it] 27%|██▋ | 129/478 [34:56<1:35:21, 16.39s/it] {'loss': '0.2046', 'grad_norm': '5.156', 'learning_rate': '1.831e-05', 'ppl': '1.227', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.95', 'tokens/total': 4128000, 'tokens/trainable': 79847, 'epoch': '0.5375'}
27%|██▋ | 129/478 [34:56<1:35:21, 16.39s/it] 27%|██▋ | 130/478 [35:11<1:32:35, 15.96s/it] {'loss': '0.2026', 'grad_norm': '4.969', 'learning_rate': '1.827e-05', 'ppl': '1.225', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.98', 'tokens/total': 4160000, 'tokens/trainable': 80444, 'epoch': '0.5417'}
27%|██▋ | 130/478 [35:11<1:32:35, 15.96s/it] 27%|██▋ | 131/478 [35:26<1:30:33, 15.66s/it] {'loss': '0.2222', 'grad_norm': '5.688', 'learning_rate': '1.823e-05', 'ppl': '1.249', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '17.91', 'tokens/total': 4192000, 'tokens/trainable': 80979, 'epoch': '0.5458'}
27%|██▋ | 131/478 [35:26<1:30:33, 15.66s/it] 28%|██▊ | 132/478 [35:41<1:29:04, 15.45s/it] {'loss': '0.2271', 'grad_norm': '5.656', 'learning_rate': '1.818e-05', 'ppl': '1.255', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.49', 'tokens/total': 4224000, 'tokens/trainable': 81531, 'epoch': '0.55'}
28%|██▊ | 132/478 [35:41<1:29:04, 15.45s/it] 28%|██▊ | 133/478 [35:55<1:27:55, 15.29s/it] {'loss': '0.2725', 'grad_norm': '4.938', 'learning_rate': '1.814e-05', 'ppl': '1.313', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '23.24', 'tokens/total': 4256000, 'tokens/trainable': 82224, 'epoch': '0.5542'}
28%|██▊ | 133/478 [35:55<1:27:55, 15.29s/it] 28%|██▊ | 134/478 [36:10<1:27:05, 15.19s/it] {'loss': '0.2539', 'grad_norm': '6.031', 'learning_rate': '1.81e-05', 'ppl': '1.289', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '23.04', 'tokens/total': 4288000, 'tokens/trainable': 82912, 'epoch': '0.5583'}
28%|██▊ | 134/478 [36:10<1:27:05, 15.19s/it] 28%|██▊ | 135/478 [36:25<1:26:25, 15.12s/it] {'loss': '0.2368', 'grad_norm': '5.312', 'learning_rate': '1.806e-05', 'ppl': '1.267', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.44', 'tokens/total': 4320000, 'tokens/trainable': 83582, 'epoch': '0.5625'}
28%|██▊ | 135/478 [36:25<1:26:25, 15.12s/it] 28%|██▊ | 136/478 [36:40<1:25:53, 15.07s/it] {'loss': '0.2617', 'grad_norm': '7.75', 'learning_rate': '1.801e-05', 'ppl': '1.299', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.19', 'tokens/total': 4352000, 'tokens/trainable': 84185, 'epoch': '0.5667'}
28%|██▊ | 136/478 [36:40<1:25:53, 15.07s/it] 29%|██▊ | 137/478 [36:55<1:25:24, 15.03s/it] {'loss': '0.2041', 'grad_norm': '4.906', 'learning_rate': '1.797e-05', 'ppl': '1.226', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.79', 'tokens/total': 4384000, 'tokens/trainable': 84805, 'epoch': '0.5708'}
29%|██▊ | 137/478 [36:55<1:25:24, 15.03s/it] 29%|██▉ | 138/478 [37:10<1:25:01, 15.01s/it] {'loss': '0.2881', 'grad_norm': '7.125', 'learning_rate': '1.792e-05', 'ppl': '1.334', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.32', 'tokens/total': 4416000, 'tokens/trainable': 85382, 'epoch': '0.575'}
29%|██▉ | 138/478 [37:10<1:25:01, 15.01s/it] 29%|██▉ | 139/478 [37:25<1:24:41, 14.99s/it] {'loss': '0.1733', 'grad_norm': '4.5', 'learning_rate': '1.788e-05', 'ppl': '1.189', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.12', 'tokens/total': 4448000, 'tokens/trainable': 85983, 'epoch': '0.5792'}
29%|██▉ | 139/478 [37:25<1:24:41, 14.99s/it] 29%|██▉ | 140/478 [37:40<1:24:22, 14.98s/it] {'loss': '0.2285', 'grad_norm': '5.844', 'learning_rate': '1.783e-05', 'ppl': '1.257', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '16.68', 'tokens/total': 4480000, 'tokens/trainable': 86481, 'epoch': '0.5833'}
29%|██▉ | 140/478 [37:40<1:24:22, 14.98s/it] 29%|██▉ | 141/478 [37:55<1:23:58, 14.95s/it] {'loss': '0.2144', 'grad_norm': '5.438', 'learning_rate': '1.779e-05', 'ppl': '1.239', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.69', 'tokens/total': 4512000, 'tokens/trainable': 87126, 'epoch': '0.5875'}
29%|██▉ | 141/478 [37:55<1:23:58, 14.95s/it] 30%|██▉ | 142/478 [38:10<1:23:42, 14.95s/it] {'loss': '0.2754', 'grad_norm': '6.188', 'learning_rate': '1.774e-05', 'ppl': '1.317', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.84', 'tokens/total': 4544000, 'tokens/trainable': 87778, 'epoch': '0.5917'}
30%|██▉ | 142/478 [38:10<1:23:42, 14.95s/it] 30%|██▉ | 143/478 [38:25<1:23:28, 14.95s/it] {'loss': '0.2407', 'grad_norm': '5.375', 'learning_rate': '1.77e-05', 'ppl': '1.272', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.08', 'tokens/total': 4576000, 'tokens/trainable': 88408, 'epoch': '0.5958'}
30%|██▉ | 143/478 [38:25<1:23:28, 14.95s/it] 30%|███ | 144/478 [38:40<1:23:14, 14.95s/it] {'loss': '0.176', 'grad_norm': '5.844', 'learning_rate': '1.765e-05', 'ppl': '1.192', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.75', 'tokens/total': 4608000, 'tokens/trainable': 88968, 'epoch': '0.6'}
30%|███ | 144/478 [38:40<1:23:14, 14.95s/it] 30%|███ | 145/478 [38:55<1:22:58, 14.95s/it] {'loss': '0.1982', 'grad_norm': '4.781', 'learning_rate': '1.76e-05', 'ppl': '1.219', 'memory/max_active (GiB)': '45.17', 'memory/max_allocated (GiB)': '45.17', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.33', 'tokens/total': 4640000, 'tokens/trainable': 89545, 'epoch': '0.6042'}
30%|███ | 145/478 [38:55<1:22:58, 14.95s/it] 31%|███ | 146/478 [39:10<1:22:41, 14.94s/it] {'loss': '0.1758', 'grad_norm': '5.562', 'learning_rate': '1.756e-05', 'ppl': '1.192', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.85', 'tokens/total': 4672000, 'tokens/trainable': 90107, 'epoch': '0.6083'}
31%|███ | 146/478 [39:10<1:22:41, 14.94s/it] 31%|███ | 147/478 [39:25<1:22:27, 14.95s/it] {'loss': '0.2261', 'grad_norm': '5.438', 'learning_rate': '1.751e-05', 'ppl': '1.254', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.66', 'tokens/total': 4704000, 'tokens/trainable': 90724, 'epoch': '0.6125'}
31%|███ | 147/478 [39:25<1:22:27, 14.95s/it] 31%|███ | 148/478 [39:40<1:22:12, 14.95s/it] {'loss': '0.2046', 'grad_norm': '5.438', 'learning_rate': '1.746e-05', 'ppl': '1.227', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.3', 'tokens/total': 4736000, 'tokens/trainable': 91390, 'epoch': '0.6167'}
31%|███ | 148/478 [39:40<1:22:12, 14.95s/it] 31%|███ | 149/478 [39:55<1:21:58, 14.95s/it] {'loss': '0.2109', 'grad_norm': '5.375', 'learning_rate': '1.741e-05', 'ppl': '1.235', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.79', 'tokens/total': 4768000, 'tokens/trainable': 91981, 'epoch': '0.6208'}
31%|███ | 149/478 [39:55<1:21:58, 14.95s/it] 31%|███▏ | 150/478 [40:10<1:21:43, 14.95s/it] {'loss': '0.208', 'grad_norm': '5.812', 'learning_rate': '1.736e-05', 'ppl': '1.231', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.79', 'tokens/total': 4800000, 'tokens/trainable': 92542, 'epoch': '0.625'}
31%|███▏ | 150/478 [40:10<1:21:43, 14.95s/it] 32%|███▏ | 151/478 [40:24<1:21:28, 14.95s/it] {'loss': '0.2383', 'grad_norm': '5.625', 'learning_rate': '1.731e-05', 'ppl': '1.269', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.89', 'tokens/total': 4832000, 'tokens/trainable': 93106, 'epoch': '0.6292'}
32%|███▏ | 151/478 [40:24<1:21:28, 14.95s/it] 32%|███▏ | 152/478 [40:39<1:21:13, 14.95s/it] {'loss': '0.2788', 'grad_norm': '7.312', 'learning_rate': '1.726e-05', 'ppl': '1.322', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.64', 'tokens/total': 4864000, 'tokens/trainable': 93782, 'epoch': '0.6333'}
32%|███▏ | 152/478 [40:39<1:21:13, 14.95s/it] 32%|███▏ | 153/478 [40:54<1:21:02, 14.96s/it] {'loss': '0.2266', 'grad_norm': '5.344', 'learning_rate': '1.721e-05', 'ppl': '1.254', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.77', 'tokens/total': 4896000, 'tokens/trainable': 94464, 'epoch': '0.6375'}
32%|███▏ | 153/478 [40:54<1:21:02, 14.96s/it] 32%|███▏ | 154/478 [41:09<1:20:47, 14.96s/it] {'loss': '0.1919', 'grad_norm': '4.656', 'learning_rate': '1.716e-05', 'ppl': '1.212', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.29', 'tokens/total': 4928000, 'tokens/trainable': 95070, 'epoch': '0.6417'}
32%|███▏ | 154/478 [41:09<1:20:47, 14.96s/it] 32%|███▏ | 155/478 [41:24<1:20:32, 14.96s/it] {'loss': '0.2554', 'grad_norm': '5.844', 'learning_rate': '1.711e-05', 'ppl': '1.291', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.57', 'tokens/total': 4960000, 'tokens/trainable': 95655, 'epoch': '0.6458'}
32%|███▏ | 155/478 [41:24<1:20:32, 14.96s/it] 33%|███▎ | 156/478 [41:39<1:20:14, 14.95s/it] {'loss': '0.2178', 'grad_norm': '4.375', 'learning_rate': '1.706e-05', 'ppl': '1.243', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '23.26', 'tokens/total': 4992000, 'tokens/trainable': 96349, 'epoch': '0.65'}
33%|███▎ | 156/478 [41:39<1:20:14, 14.95s/it] 33%|███▎ | 157/478 [41:54<1:19:59, 14.95s/it] {'loss': '0.2354', 'grad_norm': '4.625', 'learning_rate': '1.701e-05', 'ppl': '1.265', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.3', 'tokens/total': 5024000, 'tokens/trainable': 97015, 'epoch': '0.6542'}
33%|███▎ | 157/478 [41:54<1:19:59, 14.95s/it] 33%|███▎ | 158/478 [42:09<1:19:44, 14.95s/it] {'loss': '0.2778', 'grad_norm': '5.625', 'learning_rate': '1.695e-05', 'ppl': '1.32', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.6', 'tokens/total': 5056000, 'tokens/trainable': 97660, 'epoch': '0.6583'}
33%|███▎ | 158/478 [42:09<1:19:44, 14.95s/it] 33%|███▎ | 159/478 [42:24<1:19:29, 14.95s/it] {'loss': '0.1919', 'grad_norm': '6.25', 'learning_rate': '1.69e-05', 'ppl': '1.212', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.29', 'tokens/total': 5088000, 'tokens/trainable': 98206, 'epoch': '0.6625'}
33%|███▎ | 159/478 [42:24<1:19:29, 14.95s/it] 33%|███▎ | 160/478 [42:39<1:19:15, 14.95s/it] {'loss': '0.209', 'grad_norm': '4.875', 'learning_rate': '1.685e-05', 'ppl': '1.232', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '23.23', 'tokens/total': 5120000, 'tokens/trainable': 98900, 'epoch': '0.6667'}
33%|███▎ | 160/478 [42:39<1:19:15, 14.95s/it] 34%|███▎ | 161/478 [42:54<1:18:59, 14.95s/it] {'loss': '0.2231', 'grad_norm': '5.594', 'learning_rate': '1.68e-05', 'ppl': '1.25', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '17.42', 'tokens/total': 5152000, 'tokens/trainable': 99420, 'epoch': '0.6708'}
34%|███▎ | 161/478 [42:54<1:18:59, 14.95s/it] 34%|███▍ | 162/478 [43:09<1:18:44, 14.95s/it] {'loss': '0.2354', 'grad_norm': '4.938', 'learning_rate': '1.674e-05', 'ppl': '1.265', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.09', 'tokens/total': 5184000, 'tokens/trainable': 100050, 'epoch': '0.675'}
34%|███▍ | 162/478 [43:09<1:18:44, 14.95s/it] 34%|███▍ | 163/478 [43:24<1:18:29, 14.95s/it] {'loss': '0.2231', 'grad_norm': '4.656', 'learning_rate': '1.669e-05', 'ppl': '1.25', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.64', 'tokens/total': 5216000, 'tokens/trainable': 100696, 'epoch': '0.6792'}
34%|███▍ | 163/478 [43:24<1:18:29, 14.95s/it] 34%|███▍ | 164/478 [43:39<1:18:14, 14.95s/it] {'loss': '0.2285', 'grad_norm': '6', 'learning_rate': '1.663e-05', 'ppl': '1.257', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '16.67', 'tokens/total': 5248000, 'tokens/trainable': 101194, 'epoch': '0.6833'}
34%|███▍ | 164/478 [43:39<1:18:14, 14.95s/it] 35%|███▍ | 165/478 [43:54<1:18:00, 14.95s/it] {'loss': '0.2773', 'grad_norm': '5.438', 'learning_rate': '1.658e-05', 'ppl': '1.32', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.59', 'tokens/total': 5280000, 'tokens/trainable': 101869, 'epoch': '0.6875'}
35%|███▍ | 165/478 [43:54<1:18:00, 14.95s/it] 35%|███▍ | 166/478 [44:09<1:17:45, 14.95s/it] {'loss': '0.1777', 'grad_norm': '5.156', 'learning_rate': '1.652e-05', 'ppl': '1.195', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.58', 'tokens/total': 5312000, 'tokens/trainable': 102424, 'epoch': '0.6917'}
35%|███▍ | 166/478 [44:09<1:17:45, 14.95s/it] 35%|███▍ | 167/478 [44:24<1:17:28, 14.95s/it] {'loss': '0.2383', 'grad_norm': '6.719', 'learning_rate': '1.647e-05', 'ppl': '1.269', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.03', 'tokens/total': 5344000, 'tokens/trainable': 103051, 'epoch': '0.6958'}
35%|███▍ | 167/478 [44:24<1:17:28, 14.95s/it] 35%|███▌ | 168/478 [44:39<1:17:13, 14.95s/it] {'loss': '0.2178', 'grad_norm': '5.844', 'learning_rate': '1.641e-05', 'ppl': '1.243', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.22', 'tokens/total': 5376000, 'tokens/trainable': 103625, 'epoch': '0.7'}
35%|███▌ | 168/478 [44:39<1:17:13, 14.95s/it] 35%|███▌ | 169/478 [44:54<1:16:59, 14.95s/it] {'loss': '0.2124', 'grad_norm': '7.688', 'learning_rate': '1.636e-05', 'ppl': '1.237', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.02', 'tokens/total': 5408000, 'tokens/trainable': 104253, 'epoch': '0.7042'}
35%|███▌ | 169/478 [44:54<1:16:59, 14.95s/it] 36%|███▌ | 170/478 [45:09<1:16:45, 14.95s/it] {'loss': '0.1899', 'grad_norm': '5', 'learning_rate': '1.63e-05', 'ppl': '1.209', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.95', 'tokens/total': 5440000, 'tokens/trainable': 104879, 'epoch': '0.7083'}
36%|███▌ | 170/478 [45:09<1:16:45, 14.95s/it] 36%|███▌ | 171/478 [45:24<1:16:30, 14.95s/it] {'loss': '0.2271', 'grad_norm': '5.406', 'learning_rate': '1.624e-05', 'ppl': '1.255', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.79', 'tokens/total': 5472000, 'tokens/trainable': 105500, 'epoch': '0.7125'}
36%|███▌ | 171/478 [45:24<1:16:30, 14.95s/it] 36%|███▌ | 172/478 [45:38<1:16:15, 14.95s/it] {'loss': '0.3076', 'grad_norm': '6', 'learning_rate': '1.619e-05', 'ppl': '1.36', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.07', 'tokens/total': 5504000, 'tokens/trainable': 106159, 'epoch': '0.7167'}
36%|███▌ | 172/478 [45:38<1:16:15, 14.95s/it] 36%|███▌ | 173/478 [45:53<1:15:59, 14.95s/it] {'loss': '0.2109', 'grad_norm': '5.906', 'learning_rate': '1.613e-05', 'ppl': '1.235', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.8', 'tokens/total': 5536000, 'tokens/trainable': 106780, 'epoch': '0.7208'}
36%|███▌ | 173/478 [45:53<1:15:59, 14.95s/it] 36%|███▋ | 174/478 [46:08<1:15:51, 14.97s/it] {'loss': '0.2563', 'grad_norm': '5.219', 'learning_rate': '1.607e-05', 'ppl': '1.292', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '24.75', 'tokens/total': 5568000, 'tokens/trainable': 107522, 'epoch': '0.725'}
36%|███▋ | 174/478 [46:08<1:15:51, 14.97s/it] 37%|███▋ | 175/478 [46:23<1:15:34, 14.97s/it] {'loss': '0.2666', 'grad_norm': '5.25', 'learning_rate': '1.601e-05', 'ppl': '1.306', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21', 'tokens/total': 5600000, 'tokens/trainable': 108149, 'epoch': '0.7292'}
37%|███▋ | 175/478 [46:23<1:15:34, 14.97s/it] 37%|███▋ | 176/478 [46:38<1:15:17, 14.96s/it] {'loss': '0.3096', 'grad_norm': '5.406', 'learning_rate': '1.595e-05', 'ppl': '1.363', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '24.3', 'tokens/total': 5632000, 'tokens/trainable': 108874, 'epoch': '0.7333'}
37%|███▋ | 176/478 [46:38<1:15:17, 14.96s/it] 37%|███▋ | 177/478 [46:53<1:15:02, 14.96s/it] {'loss': '0.2432', 'grad_norm': '6.125', 'learning_rate': '1.59e-05', 'ppl': '1.275', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.38', 'tokens/total': 5664000, 'tokens/trainable': 109483, 'epoch': '0.7375'}
37%|███▋ | 177/478 [46:53<1:15:02, 14.96s/it] 37%|███▋ | 178/478 [47:08<1:14:46, 14.96s/it] {'loss': '0.1968', 'grad_norm': '4.75', 'learning_rate': '1.584e-05', 'ppl': '1.217', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.87', 'tokens/total': 5696000, 'tokens/trainable': 110166, 'epoch': '0.7417'}
37%|███▋ | 178/478 [47:08<1:14:46, 14.96s/it] 37%|███▋ | 179/478 [47:23<1:14:31, 14.96s/it] {'loss': '0.1934', 'grad_norm': '5.656', 'learning_rate': '1.578e-05', 'ppl': '1.213', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '17.58', 'tokens/total': 5728000, 'tokens/trainable': 110691, 'epoch': '0.7458'}
37%|███▋ | 179/478 [47:23<1:14:31, 14.96s/it] 38%|███▊ | 180/478 [47:38<1:14:16, 14.95s/it] {'loss': '0.2036', 'grad_norm': '4.781', 'learning_rate': '1.572e-05', 'ppl': '1.226', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.46', 'tokens/total': 5760000, 'tokens/trainable': 111272, 'epoch': '0.75'}
38%|███▊ | 180/478 [47:38<1:14:16, 14.95s/it] 38%|███▊ | 181/478 [47:53<1:14:01, 14.95s/it] {'loss': '0.2769', 'grad_norm': '6.219', 'learning_rate': '1.566e-05', 'ppl': '1.319', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.86', 'tokens/total': 5792000, 'tokens/trainable': 111895, 'epoch': '0.7542'}
38%|███▊ | 181/478 [47:53<1:14:01, 14.95s/it] 38%|███▊ | 182/478 [48:08<1:13:46, 14.95s/it] {'loss': '0.1821', 'grad_norm': '4.594', 'learning_rate': '1.56e-05', 'ppl': '1.2', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.58', 'tokens/total': 5824000, 'tokens/trainable': 112450, 'epoch': '0.7583'}
38%|███▊ | 182/478 [48:08<1:13:46, 14.95s/it] 38%|███▊ | 183/478 [48:23<1:13:31, 14.95s/it] {'loss': '0.2163', 'grad_norm': '5.031', 'learning_rate': '1.554e-05', 'ppl': '1.241', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.02', 'tokens/total': 5856000, 'tokens/trainable': 113018, 'epoch': '0.7625'}
38%|███▊ | 183/478 [48:23<1:13:31, 14.95s/it] 38%|███▊ | 184/478 [48:38<1:13:15, 14.95s/it] {'loss': '0.1885', 'grad_norm': '4.406', 'learning_rate': '1.548e-05', 'ppl': '1.207', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.1', 'tokens/total': 5888000, 'tokens/trainable': 113648, 'epoch': '0.7667'}
38%|███▊ | 184/478 [48:38<1:13:15, 14.95s/it] 39%|███▊ | 185/478 [48:53<1:13:01, 14.95s/it] {'loss': '0.1938', 'grad_norm': '5.812', 'learning_rate': '1.541e-05', 'ppl': '1.214', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '17.74', 'tokens/total': 5920000, 'tokens/trainable': 114178, 'epoch': '0.7708'}
39%|███▊ | 185/478 [48:53<1:13:01, 14.95s/it] 39%|███▉ | 186/478 [49:08<1:12:46, 14.95s/it] {'loss': '0.2041', 'grad_norm': '6.5', 'learning_rate': '1.535e-05', 'ppl': '1.226', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.26', 'tokens/total': 5952000, 'tokens/trainable': 114843, 'epoch': '0.775'}
39%|███▉ | 186/478 [49:08<1:12:46, 14.95s/it] 39%|███▉ | 187/478 [49:23<1:12:31, 14.95s/it] {'loss': '0.2329', 'grad_norm': '6.531', 'learning_rate': '1.529e-05', 'ppl': '1.262', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.22', 'tokens/total': 5984000, 'tokens/trainable': 115447, 'epoch': '0.7792'}
39%|███▉ | 187/478 [49:23<1:12:31, 14.95s/it] 39%|███▉ | 188/478 [49:38<1:12:16, 14.95s/it] {'loss': '0.1675', 'grad_norm': '4.219', 'learning_rate': '1.523e-05', 'ppl': '1.182', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.43', 'tokens/total': 6016000, 'tokens/trainable': 116087, 'epoch': '0.7833'}
39%|███▉ | 188/478 [49:38<1:12:16, 14.95s/it] 40%|███▉ | 189/478 [49:53<1:12:01, 14.95s/it] {'loss': '0.1831', 'grad_norm': '6.25', 'learning_rate': '1.517e-05', 'ppl': '1.201', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.99', 'tokens/total': 6048000, 'tokens/trainable': 116714, 'epoch': '0.7875'}
40%|███▉ | 189/478 [49:53<1:12:01, 14.95s/it] 40%|███▉ | 190/478 [50:08<1:11:47, 14.96s/it] {'loss': '0.2251', 'grad_norm': '6', 'learning_rate': '1.51e-05', 'ppl': '1.252', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.75', 'tokens/total': 6080000, 'tokens/trainable': 117364, 'epoch': '0.7917'}
40%|███▉ | 190/478 [50:08<1:11:47, 14.96s/it] 40%|███▉ | 191/478 [50:23<1:11:33, 14.96s/it] {'loss': '0.2207', 'grad_norm': '5.125', 'learning_rate': '1.504e-05', 'ppl': '1.247', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '23', 'tokens/total': 6112000, 'tokens/trainable': 118052, 'epoch': '0.7958'}
40%|███▉ | 191/478 [50:23<1:11:33, 14.96s/it] 40%|████ | 192/478 [50:38<1:11:18, 14.96s/it] {'loss': '0.2251', 'grad_norm': '5.281', 'learning_rate': '1.498e-05', 'ppl': '1.252', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.53', 'tokens/total': 6144000, 'tokens/trainable': 118695, 'epoch': '0.8'}
40%|████ | 192/478 [50:38<1:11:18, 14.96s/it] 40%|████ | 193/478 [50:53<1:11:06, 14.97s/it] {'loss': '0.1963', 'grad_norm': '4.969', 'learning_rate': '1.492e-05', 'ppl': '1.217', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.1', 'tokens/total': 6176000, 'tokens/trainable': 119357, 'epoch': '0.8042'}
40%|████ | 193/478 [50:53<1:11:06, 14.97s/it] 41%|████ | 194/478 [51:08<1:10:44, 14.95s/it] {'loss': '0.2544', 'grad_norm': '6.562', 'learning_rate': '1.485e-05', 'ppl': '1.29', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.29', 'tokens/total': 6208000, 'tokens/trainable': 120020, 'epoch': '0.8083'}
41%|████ | 194/478 [51:08<1:10:44, 14.95s/it] 41%|████ | 195/478 [51:22<1:10:29, 14.95s/it] {'loss': '0.1855', 'grad_norm': '4.75', 'learning_rate': '1.479e-05', 'ppl': '1.204', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.68', 'tokens/total': 6240000, 'tokens/trainable': 120697, 'epoch': '0.8125'}
41%|████ | 195/478 [51:22<1:10:29, 14.95s/it] 41%|████ | 196/478 [51:37<1:10:13, 14.94s/it] {'loss': '0.2305', 'grad_norm': '4.844', 'learning_rate': '1.472e-05', 'ppl': '1.259', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.5', 'tokens/total': 6272000, 'tokens/trainable': 121368, 'epoch': '0.8167'}
41%|████ | 196/478 [51:37<1:10:13, 14.94s/it] 41%|████ | 197/478 [51:52<1:10:00, 14.95s/it] {'loss': '0.2656', 'grad_norm': '5.969', 'learning_rate': '1.466e-05', 'ppl': '1.304', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.08', 'tokens/total': 6304000, 'tokens/trainable': 121938, 'epoch': '0.8208'}
41%|████ | 197/478 [51:52<1:10:00, 14.95s/it] 41%|████▏ | 198/478 [52:07<1:09:45, 14.95s/it] {'loss': '0.2202', 'grad_norm': '6.125', 'learning_rate': '1.46e-05', 'ppl': '1.246', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.7', 'tokens/total': 6336000, 'tokens/trainable': 122586, 'epoch': '0.825'}
41%|████▏ | 198/478 [52:07<1:09:45, 14.95s/it] 42%|████▏ | 199/478 [52:22<1:09:30, 14.95s/it] {'loss': '0.2485', 'grad_norm': '7.156', 'learning_rate': '1.453e-05', 'ppl': '1.282', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.35', 'tokens/total': 6368000, 'tokens/trainable': 123164, 'epoch': '0.8292'}
42%|████▏ | 199/478 [52:22<1:09:30, 14.95s/it] 42%|████▏ | 200/478 [52:37<1:09:16, 14.95s/it] {'loss': '0.2705', 'grad_norm': '6.5', 'learning_rate': '1.447e-05', 'ppl': '1.311', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.86', 'tokens/total': 6400000, 'tokens/trainable': 123787, 'epoch': '0.8333'}
42%|████▏ | 200/478 [52:37<1:09:16, 14.95s/it] 42%|████▏ | 201/478 [52:52<1:09:01, 14.95s/it] {'loss': '0.2153', 'grad_norm': '5.312', 'learning_rate': '1.44e-05', 'ppl': '1.24', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.86', 'tokens/total': 6432000, 'tokens/trainable': 124410, 'epoch': '0.8375'}
42%|████▏ | 201/478 [52:52<1:09:01, 14.95s/it] 42%|████▏ | 202/478 [53:07<1:08:46, 14.95s/it] {'loss': '0.2012', 'grad_norm': '5.531', 'learning_rate': '1.433e-05', 'ppl': '1.223', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.95', 'tokens/total': 6464000, 'tokens/trainable': 124976, 'epoch': '0.8417'}
42%|████▏ | 202/478 [53:07<1:08:46, 14.95s/it] 42%|████▏ | 203/478 [53:22<1:08:32, 14.95s/it] {'loss': '0.2217', 'grad_norm': '5.812', 'learning_rate': '1.427e-05', 'ppl': '1.248', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.09', 'tokens/total': 6496000, 'tokens/trainable': 125606, 'epoch': '0.8458'}
42%|████▏ | 203/478 [53:22<1:08:32, 14.95s/it] 43%|████▎ | 204/478 [53:37<1:08:17, 14.95s/it] {'loss': '0.2534', 'grad_norm': '5.469', 'learning_rate': '1.42e-05', 'ppl': '1.288', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '23.2', 'tokens/total': 6528000, 'tokens/trainable': 126299, 'epoch': '0.85'}
43%|████▎ | 204/478 [53:37<1:08:17, 14.95s/it] 43%|████▎ | 205/478 [53:52<1:08:00, 14.95s/it] {'loss': '0.1675', 'grad_norm': '4.281', 'learning_rate': '1.414e-05', 'ppl': '1.182', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.46', 'tokens/total': 6560000, 'tokens/trainable': 126939, 'epoch': '0.8542'}
43%|████▎ | 205/478 [53:52<1:08:00, 14.95s/it] 43%|████▎ | 206/478 [54:07<1:07:45, 14.95s/it] {'loss': '0.1685', 'grad_norm': '4.281', 'learning_rate': '1.407e-05', 'ppl': '1.183', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.8', 'tokens/total': 6592000, 'tokens/trainable': 127590, 'epoch': '0.8583'}
43%|████▎ | 206/478 [54:07<1:07:45, 14.95s/it] 43%|████▎ | 207/478 [54:22<1:07:31, 14.95s/it] {'loss': '0.229', 'grad_norm': '5.531', 'learning_rate': '1.4e-05', 'ppl': '1.257', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '23.74', 'tokens/total': 6624000, 'tokens/trainable': 128299, 'epoch': '0.8625'}
43%|████▎ | 207/478 [54:22<1:07:31, 14.95s/it] 44%|████▎ | 208/478 [54:37<1:07:16, 14.95s/it] {'loss': '0.1836', 'grad_norm': '7.562', 'learning_rate': '1.394e-05', 'ppl': '1.202', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.96', 'tokens/total': 6656000, 'tokens/trainable': 128895, 'epoch': '0.8667'}
44%|████▎ | 208/478 [54:37<1:07:16, 14.95s/it] 44%|████▎ | 209/478 [54:52<1:06:56, 14.93s/it] {'loss': '0.2495', 'grad_norm': '5.031', 'learning_rate': '1.387e-05', 'ppl': '1.283', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '23.14', 'tokens/total': 6688000, 'tokens/trainable': 129583, 'epoch': '0.8708'}
44%|████▎ | 209/478 [54:52<1:06:56, 14.93s/it] 44%|████▍ | 210/478 [55:07<1:06:43, 14.94s/it] {'loss': '0.1885', 'grad_norm': '4.531', 'learning_rate': '1.38e-05', 'ppl': '1.207', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.06', 'tokens/total': 6720000, 'tokens/trainable': 130212, 'epoch': '0.875'}
44%|████▍ | 210/478 [55:07<1:06:43, 14.94s/it] 44%|████▍ | 211/478 [55:22<1:07:09, 15.09s/it] {'loss': '0.1689', 'grad_norm': '4.281', 'learning_rate': '1.373e-05', 'ppl': '1.184', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.63', 'tokens/total': 6752000, 'tokens/trainable': 130880, 'epoch': '0.8792'}
44%|████▍ | 211/478 [55:22<1:07:09, 15.09s/it] 44%|████▍ | 212/478 [55:37<1:06:38, 15.03s/it] {'loss': '0.2295', 'grad_norm': '4.906', 'learning_rate': '1.367e-05', 'ppl': '1.258', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.62', 'tokens/total': 6784000, 'tokens/trainable': 131523, 'epoch': '0.8833'}
44%|████▍ | 212/478 [55:37<1:06:38, 15.03s/it] 45%|████▍ | 213/478 [55:52<1:06:16, 15.00s/it] {'loss': '0.2222', 'grad_norm': '4.969', 'learning_rate': '1.36e-05', 'ppl': '1.249', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.21', 'tokens/total': 6816000, 'tokens/trainable': 132186, 'epoch': '0.8875'}
45%|████▍ | 213/478 [55:52<1:06:16, 15.00s/it] 45%|████▍ | 214/478 [56:07<1:05:56, 14.99s/it] {'loss': '0.2119', 'grad_norm': '5', 'learning_rate': '1.353e-05', 'ppl': '1.236', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '16.88', 'tokens/total': 6848000, 'tokens/trainable': 132690, 'epoch': '0.8917'}
45%|████▍ | 214/478 [56:07<1:05:56, 14.99s/it] 45%|████▍ | 215/478 [56:22<1:05:38, 14.98s/it] {'loss': '0.1875', 'grad_norm': '5.219', 'learning_rate': '1.346e-05', 'ppl': '1.206', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.29', 'tokens/total': 6880000, 'tokens/trainable': 133296, 'epoch': '0.8958'}
45%|████▍ | 215/478 [56:22<1:05:38, 14.98s/it] 45%|████▌ | 216/478 [56:37<1:05:21, 14.97s/it] {'loss': '0.2109', 'grad_norm': '4.906', 'learning_rate': '1.339e-05', 'ppl': '1.235', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.47', 'tokens/total': 6912000, 'tokens/trainable': 133937, 'epoch': '0.9'}
45%|████▌ | 216/478 [56:37<1:05:21, 14.97s/it] 45%|████▌ | 217/478 [56:52<1:05:04, 14.96s/it] {'loss': '0.2026', 'grad_norm': '5.781', 'learning_rate': '1.332e-05', 'ppl': '1.225', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.3', 'tokens/total': 6944000, 'tokens/trainable': 134573, 'epoch': '0.9042'}
45%|████▌ | 217/478 [56:52<1:05:04, 14.96s/it] 46%|████▌ | 218/478 [57:07<1:04:47, 14.95s/it] {'loss': '0.1709', 'grad_norm': '3.938', 'learning_rate': '1.326e-05', 'ppl': '1.186', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.43', 'tokens/total': 6976000, 'tokens/trainable': 135242, 'epoch': '0.9083'}
46%|████▌ | 218/478 [57:07<1:04:47, 14.95s/it] 46%|████▌ | 219/478 [57:22<1:04:32, 14.95s/it] {'loss': '0.1785', 'grad_norm': '4.594', 'learning_rate': '1.319e-05', 'ppl': '1.195', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.04', 'tokens/total': 7008000, 'tokens/trainable': 135900, 'epoch': '0.9125'}
46%|████▌ | 219/478 [57:22<1:04:32, 14.95s/it] 46%|████▌ | 220/478 [57:37<1:04:17, 14.95s/it] {'loss': '0.188', 'grad_norm': '4.688', 'learning_rate': '1.312e-05', 'ppl': '1.207', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.67', 'tokens/total': 7040000, 'tokens/trainable': 136577, 'epoch': '0.9167'}
46%|████▌ | 220/478 [57:37<1:04:17, 14.95s/it] 46%|████▌ | 221/478 [57:51<1:04:00, 14.94s/it] {'loss': '0.2163', 'grad_norm': '5.906', 'learning_rate': '1.305e-05', 'ppl': '1.241', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '23.88', 'tokens/total': 7072000, 'tokens/trainable': 137289, 'epoch': '0.9208'}
46%|████▌ | 221/478 [57:51<1:04:00, 14.94s/it] 46%|████▋ | 222/478 [58:06<1:03:45, 14.95s/it] {'loss': '0.2183', 'grad_norm': '5.125', 'learning_rate': '1.298e-05', 'ppl': '1.244', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.47', 'tokens/total': 7104000, 'tokens/trainable': 137900, 'epoch': '0.925'}
46%|████▋ | 222/478 [58:06<1:03:45, 14.95s/it] 47%|████▋ | 223/478 [58:21<1:03:31, 14.95s/it] {'loss': '0.2695', 'grad_norm': '6.469', 'learning_rate': '1.291e-05', 'ppl': '1.309', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.2', 'tokens/total': 7136000, 'tokens/trainable': 138533, 'epoch': '0.9292'}
47%|████▋ | 223/478 [58:21<1:03:31, 14.95s/it] 47%|████▋ | 224/478 [58:36<1:03:16, 14.95s/it] {'loss': '0.1685', 'grad_norm': '5.625', 'learning_rate': '1.284e-05', 'ppl': '1.183', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.9', 'tokens/total': 7168000, 'tokens/trainable': 139157, 'epoch': '0.9333'}
47%|████▋ | 224/478 [58:36<1:03:16, 14.95s/it] 47%|████▋ | 225/478 [58:51<1:03:02, 14.95s/it] {'loss': '0.1929', 'grad_norm': '4.969', 'learning_rate': '1.277e-05', 'ppl': '1.213', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.86', 'tokens/total': 7200000, 'tokens/trainable': 139810, 'epoch': '0.9375'}
47%|████▋ | 225/478 [58:51<1:03:02, 14.95s/it] 47%|████▋ | 226/478 [59:06<1:02:47, 14.95s/it] {'loss': '0.2168', 'grad_norm': '5.719', 'learning_rate': '1.27e-05', 'ppl': '1.242', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.69', 'tokens/total': 7232000, 'tokens/trainable': 140398, 'epoch': '0.9417'}
47%|████▋ | 226/478 [59:06<1:02:47, 14.95s/it] 47%|████▋ | 227/478 [59:21<1:02:32, 14.95s/it] {'loss': '0.2075', 'grad_norm': '5.344', 'learning_rate': '1.263e-05', 'ppl': '1.231', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.47', 'tokens/total': 7264000, 'tokens/trainable': 141069, 'epoch': '0.9458'}
47%|████▋ | 227/478 [59:21<1:02:32, 14.95s/it] 48%|████▊ | 228/478 [59:36<1:02:17, 14.95s/it] {'loss': '0.2222', 'grad_norm': '4.969', 'learning_rate': '1.256e-05', 'ppl': '1.249', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.02', 'tokens/total': 7296000, 'tokens/trainable': 141697, 'epoch': '0.95'}
48%|████▊ | 228/478 [59:36<1:02:17, 14.95s/it] 48%|████▊ | 229/478 [59:51<1:02:02, 14.95s/it] {'loss': '0.2402', 'grad_norm': '5.469', 'learning_rate': '1.249e-05', 'ppl': '1.272', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '17.82', 'tokens/total': 7328000, 'tokens/trainable': 142229, 'epoch': '0.9542'}
48%|████▊ | 229/478 [59:51<1:02:02, 14.95s/it] 48%|████▊ | 230/478 [1:00:06<1:01:47, 14.95s/it] {'loss': '0.2148', 'grad_norm': '5.625', 'learning_rate': '1.242e-05', 'ppl': '1.24', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.49', 'tokens/total': 7360000, 'tokens/trainable': 142841, 'epoch': '0.9583'}
48%|████▊ | 230/478 [1:00:06<1:01:47, 14.95s/it] 48%|████▊ | 231/478 [1:00:21<1:01:32, 14.95s/it] {'loss': '0.188', 'grad_norm': '5.344', 'learning_rate': '1.235e-05', 'ppl': '1.207', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.23', 'tokens/total': 7392000, 'tokens/trainable': 143475, 'epoch': '0.9625'}
48%|████▊ | 231/478 [1:00:21<1:01:32, 14.95s/it] 49%|████▊ | 232/478 [1:00:36<1:01:17, 14.95s/it] {'loss': '0.2285', 'grad_norm': '5.969', 'learning_rate': '1.228e-05', 'ppl': '1.257', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '17.81', 'tokens/total': 7424000, 'tokens/trainable': 144007, 'epoch': '0.9667'}
49%|████▊ | 232/478 [1:00:36<1:01:17, 14.95s/it] 49%|████▊ | 233/478 [1:00:51<1:01:05, 14.96s/it] {'loss': '0.1855', 'grad_norm': '5.375', 'learning_rate': '1.22e-05', 'ppl': '1.204', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.28', 'tokens/total': 7456000, 'tokens/trainable': 144584, 'epoch': '0.9708'}
49%|████▊ | 233/478 [1:00:51<1:01:05, 14.96s/it] 49%|████▉ | 234/478 [1:01:06<1:00:48, 14.95s/it] {'loss': '0.1929', 'grad_norm': '4.344', 'learning_rate': '1.213e-05', 'ppl': '1.213', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '23.1', 'tokens/total': 7488000, 'tokens/trainable': 145273, 'epoch': '0.975'}
49%|████▉ | 234/478 [1:01:06<1:00:48, 14.95s/it] 49%|████▉ | 235/478 [1:01:21<1:00:33, 14.95s/it] {'loss': '0.1831', 'grad_norm': '5.031', 'learning_rate': '1.206e-05', 'ppl': '1.201', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.3', 'tokens/total': 7520000, 'tokens/trainable': 145879, 'epoch': '0.9792'}
49%|████▉ | 235/478 [1:01:21<1:00:33, 14.95s/it] 49%|████▉ | 236/478 [1:01:36<1:00:18, 14.95s/it] {'loss': '0.2319', 'grad_norm': '5.719', 'learning_rate': '1.199e-05', 'ppl': '1.261', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.52', 'tokens/total': 7552000, 'tokens/trainable': 146552, 'epoch': '0.9833'}
49%|████▉ | 236/478 [1:01:36<1:00:18, 14.95s/it] 50%|████▉ | 237/478 [1:01:51<1:00:03, 14.95s/it] {'loss': '0.251', 'grad_norm': '5.812', 'learning_rate': '1.192e-05', 'ppl': '1.285', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.9', 'tokens/total': 7584000, 'tokens/trainable': 147206, 'epoch': '0.9875'}
50%|████▉ | 237/478 [1:01:51<1:00:03, 14.95s/it] 50%|████▉ | 238/478 [1:02:06<59:48, 14.95s/it] {'loss': '0.2085', 'grad_norm': '5.406', 'learning_rate': '1.185e-05', 'ppl': '1.232', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.62', 'tokens/total': 7616000, 'tokens/trainable': 147792, 'epoch': '0.9917'}
50%|████▉ | 238/478 [1:02:06<59:48, 14.95s/it] 50%|█████ | 239/478 [1:02:21<59:33, 14.95s/it] {'loss': '0.2236', 'grad_norm': '6.562', 'learning_rate': '1.178e-05', 'ppl': '1.251', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.16', 'tokens/total': 7648000, 'tokens/trainable': 148424, 'epoch': '0.9958'}
50%|█████ | 239/478 [1:02:21<59:33, 14.95s/it][2026-04-17 03:11:51,731] [INFO] [axolotl.core.trainers.base] Saving model checkpoint to /workspace/data/outputs/qwen3-4B/fft_magnifi-module-classifier-04-17-relabelled-upsampled/checkpoint-239
Writing model shards: 0%| | 0/1 [00:00<?, ?it/s]
Writing model shards: 100%|██████████| 1/1 [00:22<00:00, 22.99s/it] Writing model shards: 100%|██████████| 1/1 [00:22<00:00, 22.99s/it]
50%|█████ | 240/478 [1:04:39<3:26:42, 52.11s/it] {'loss': '0.229', 'grad_norm': '5.062', 'learning_rate': '1.17e-05', 'ppl': '1.257', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.67', 'tokens/total': 7680000, 'tokens/trainable': 149041, 'epoch': '1'}
50%|█████ | 240/478 [1:04:39<3:26:42, 52.11s/it][2026-04-17 03:14:04,074] [INFO] [axolotl.core.trainers.base] Running evaluation step...
[2026-04-17 03:14:12,814] [INFO] [axolotl.utils.samplers.multipack] gather_len_batches: [54, 54]
0%| | 0/27 [00:00<?, ?it/s]
7%|▋ | 2/27 [00:02<00:35, 1.41s/it]
11%|█ | 3/27 [00:05<00:47, 1.97s/it]
15%|█▍ | 4/27 [00:08<00:51, 2.26s/it]
19%|█▊ | 5/27 [00:11<00:53, 2.43s/it]
22%|██▏ | 6/27 [00:13<00:53, 2.54s/it]
26%|██▌ | 7/27 [00:16<00:52, 2.61s/it]
30%|██▉ | 8/27 [00:19<00:50, 2.66s/it]
33%|███▎ | 9/27 [00:22<00:48, 2.68s/it]
37%|███▋ | 10/27 [00:24<00:45, 2.70s/it]
41%|████ | 11/27 [00:27<00:43, 2.72s/it]
44%|████▍ | 12/27 [00:30<00:39, 2.64s/it]
48%|████▊ | 13/27 [00:33<00:38, 2.76s/it]
52%|█████▏ | 14/27 [00:35<00:35, 2.76s/it]
56%|█████▌ | 15/27 [00:38<00:33, 2.76s/it]
59%|█████▉ | 16/27 [00:41<00:30, 2.76s/it]
63%|██████▎ | 17/27 [00:44<00:27, 2.75s/it]
67%|██████▋ | 18/27 [00:46<00:24, 2.75s/it]
70%|███████ | 19/27 [00:49<00:22, 2.75s/it]
74%|███████▍ | 20/27 [00:52<00:19, 2.75s/it]
78%|███████▊ | 21/27 [00:54<00:16, 2.67s/it]
81%|████████▏ | 22/27 [00:57<00:13, 2.78s/it]
85%|████████▌ | 23/27 [01:00<00:11, 2.77s/it]
89%|████████▉ | 24/27 [01:03<00:08, 2.76s/it]
93%|█████████▎| 25/27 [01:06<00:05, 2.76s/it]
96%|█████████▋| 26/27 [01:08<00:02, 2.76s/it]
100%|██████████| 27/27 [01:11<00:00, 2.84s/it]
{'eval_loss': '0.2166', 'eval_runtime': '75.42', 'eval_samples_per_second': '2.771', 'eval_steps_per_second': '1.392', 'eval_ppl': '1.242', 'memory/max_active (GiB)': '34.91', 'memory/max_allocated (GiB)': '34.91', 'memory/device_reserved (GiB)': '57.54', 'epoch': '1', 'tokens/train_per_sec_per_gpu': '0'}
50%|█████ | 240/478 [1:06:04<3:26:42, 52.11s/it]
100%|██████████| 27/27 [01:13<00:00, 2.84s/it]
50%|█████ | 241/478 [1:06:20<4:22:47, 66.53s/it] {'loss': '0.2046', 'grad_norm': '5.375', 'learning_rate': '1.163e-05', 'ppl': '1.227', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.95', 'tokens/total': 7712000, 'tokens/trainable': 149728, 'epoch': '1.004'}
50%|█████ | 241/478 [1:06:20<4:22:47, 66.53s/it] 51%|█████ | 242/478 [1:06:35<3:20:49, 51.06s/it] {'loss': '0.2031', 'grad_norm': '5.406', 'learning_rate': '1.156e-05', 'ppl': '1.225', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.85', 'tokens/total': 7744000, 'tokens/trainable': 150351, 'epoch': '1.008'}
51%|█████ | 242/478 [1:06:35<3:20:49, 51.06s/it] 51%|█████ | 243/478 [1:06:50<2:37:33, 40.23s/it] {'loss': '0.1758', 'grad_norm': '5', 'learning_rate': '1.149e-05', 'ppl': '1.192', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.47', 'tokens/total': 7776000, 'tokens/trainable': 150903, 'epoch': '1.012'}
51%|█████ | 243/478 [1:06:50<2:37:33, 40.23s/it] 51%|█████ | 244/478 [1:07:04<2:07:19, 32.65s/it] {'loss': '0.1514', 'grad_norm': '4.969', 'learning_rate': '1.142e-05', 'ppl': '1.163', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.07', 'tokens/total': 7808000, 'tokens/trainable': 151443, 'epoch': '1.017'}
51%|█████ | 244/478 [1:07:04<2:07:19, 32.65s/it] 51%|█████▏ | 245/478 [1:07:19<1:46:10, 27.34s/it] {'loss': '0.186', 'grad_norm': '4.875', 'learning_rate': '1.134e-05', 'ppl': '1.204', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.38', 'tokens/total': 7840000, 'tokens/trainable': 151992, 'epoch': '1.021'}
51%|█████▏ | 245/478 [1:07:19<1:46:10, 27.34s/it] 51%|█████▏ | 246/478 [1:07:34<1:31:21, 23.63s/it] {'loss': '0.1411', 'grad_norm': '3.984', 'learning_rate': '1.127e-05', 'ppl': '1.152', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.85', 'tokens/total': 7872000, 'tokens/trainable': 152615, 'epoch': '1.025'}
51%|█████▏ | 246/478 [1:07:34<1:31:21, 23.63s/it] 52%|█████▏ | 247/478 [1:07:49<1:20:56, 21.02s/it] {'loss': '0.2158', 'grad_norm': '6.188', 'learning_rate': '1.12e-05', 'ppl': '1.241', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.35', 'tokens/total': 7904000, 'tokens/trainable': 153193, 'epoch': '1.029'}
52%|█████▏ | 247/478 [1:07:49<1:20:56, 21.02s/it] 52%|█████▏ | 248/478 [1:08:04<1:13:37, 19.20s/it] {'loss': '0.1768', 'grad_norm': '4.625', 'learning_rate': '1.113e-05', 'ppl': '1.193', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.75', 'tokens/total': 7936000, 'tokens/trainable': 153783, 'epoch': '1.033'}
52%|█████▏ | 248/478 [1:08:04<1:13:37, 19.20s/it] 52%|█████▏ | 249/478 [1:08:19<1:08:26, 17.93s/it] {'loss': '0.1704', 'grad_norm': '4.625', 'learning_rate': '1.105e-05', 'ppl': '1.186', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.35', 'tokens/total': 7968000, 'tokens/trainable': 154451, 'epoch': '1.038'}
52%|█████▏ | 249/478 [1:08:19<1:08:26, 17.93s/it] 52%|█████▏ | 250/478 [1:08:34<1:04:44, 17.04s/it] {'loss': '0.186', 'grad_norm': '4.688', 'learning_rate': '1.098e-05', 'ppl': '1.204', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.79', 'tokens/total': 8000000, 'tokens/trainable': 155072, 'epoch': '1.042'}
52%|█████▏ | 250/478 [1:08:34<1:04:44, 17.04s/it] 53%|█████▎ | 251/478 [1:08:49<1:02:06, 16.42s/it] {'loss': '0.1641', 'grad_norm': '4.125', 'learning_rate': '1.091e-05', 'ppl': '1.178', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.54', 'tokens/total': 8032000, 'tokens/trainable': 155656, 'epoch': '1.046'}
53%|█████▎ | 251/478 [1:08:49<1:02:06, 16.42s/it] 53%|█████▎ | 252/478 [1:09:04<1:00:10, 15.98s/it] {'loss': '0.1631', 'grad_norm': '4.5', 'learning_rate': '1.084e-05', 'ppl': '1.177', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.02', 'tokens/total': 8064000, 'tokens/trainable': 156254, 'epoch': '1.05'}
53%|█████▎ | 252/478 [1:09:04<1:00:10, 15.98s/it] 53%|█████▎ | 253/478 [1:09:19<58:45, 15.67s/it] {'loss': '0.2163', 'grad_norm': '4.969', 'learning_rate': '1.076e-05', 'ppl': '1.241', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.45', 'tokens/total': 8096000, 'tokens/trainable': 156805, 'epoch': '1.054'}
53%|█████▎ | 253/478 [1:09:19<58:45, 15.67s/it] 53%|█████▎ | 254/478 [1:09:34<57:42, 15.46s/it] {'loss': '0.1792', 'grad_norm': '4.625', 'learning_rate': '1.069e-05', 'ppl': '1.196', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.95', 'tokens/total': 8128000, 'tokens/trainable': 157371, 'epoch': '1.058'}
53%|█████▎ | 254/478 [1:09:34<57:42, 15.46s/it] 53%|█████▎ | 255/478 [1:09:49<56:53, 15.31s/it] {'loss': '0.1528', 'grad_norm': '5.031', 'learning_rate': '1.062e-05', 'ppl': '1.165', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.46', 'tokens/total': 8160000, 'tokens/trainable': 158042, 'epoch': '1.062'}
53%|█████▎ | 255/478 [1:09:49<56:53, 15.31s/it] 54%|█████▎ | 256/478 [1:10:04<56:14, 15.20s/it] {'loss': '0.1448', 'grad_norm': '4.219', 'learning_rate': '1.055e-05', 'ppl': '1.156', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.79', 'tokens/total': 8192000, 'tokens/trainable': 158693, 'epoch': '1.067'}
54%|█████▎ | 256/478 [1:10:04<56:14, 15.20s/it] 54%|█████▍ | 257/478 [1:10:19<55:41, 15.12s/it] {'loss': '0.09375', 'grad_norm': '3.844', 'learning_rate': '1.047e-05', 'ppl': '1.098', 'memory/max_active (GiB)': '45.17', 'memory/max_allocated (GiB)': '45.17', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '23.17', 'tokens/total': 8224000, 'tokens/trainable': 159384, 'epoch': '1.071'}
54%|█████▍ | 257/478 [1:10:19<55:41, 15.12s/it] 54%|█████▍ | 258/478 [1:10:34<55:16, 15.07s/it] {'loss': '0.1543', 'grad_norm': '4.625', 'learning_rate': '1.04e-05', 'ppl': '1.167', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.72', 'tokens/total': 8256000, 'tokens/trainable': 160063, 'epoch': '1.075'}
54%|█████▍ | 258/478 [1:10:34<55:16, 15.07s/it] 54%|█████▍ | 259/478 [1:10:49<54:54, 15.05s/it] {'loss': '0.187', 'grad_norm': '5.406', 'learning_rate': '1.033e-05', 'ppl': '1.206', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.78', 'tokens/total': 8288000, 'tokens/trainable': 160625, 'epoch': '1.079'}
54%|█████▍ | 259/478 [1:10:49<54:54, 15.05s/it] 54%|█████▍ | 260/478 [1:11:04<54:33, 15.02s/it] {'loss': '0.1433', 'grad_norm': '4.312', 'learning_rate': '1.026e-05', 'ppl': '1.154', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.33', 'tokens/total': 8320000, 'tokens/trainable': 161262, 'epoch': '1.083'}
54%|█████▍ | 260/478 [1:11:04<54:33, 15.02s/it] 55%|█████▍ | 261/478 [1:11:19<54:49, 15.16s/it] {'loss': '0.1436', 'grad_norm': '4.25', 'learning_rate': '1.018e-05', 'ppl': '1.154', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.3', 'tokens/total': 8352000, 'tokens/trainable': 161921, 'epoch': '1.087'}
55%|█████▍ | 261/478 [1:11:19<54:49, 15.16s/it] 55%|█████▍ | 262/478 [1:11:34<54:21, 15.10s/it] {'loss': '0.168', 'grad_norm': '4.5', 'learning_rate': '1.011e-05', 'ppl': '1.183', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.92', 'tokens/total': 8384000, 'tokens/trainable': 162546, 'epoch': '1.092'}
55%|█████▍ | 262/478 [1:11:34<54:21, 15.10s/it] 55%|█████▌ | 263/478 [1:11:49<53:55, 15.05s/it] {'loss': '0.1802', 'grad_norm': '5.188', 'learning_rate': '1.004e-05', 'ppl': '1.197', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.87', 'tokens/total': 8416000, 'tokens/trainable': 163228, 'epoch': '1.096'}
55%|█████▌ | 263/478 [1:11:49<53:55, 15.05s/it] 55%|█████▌ | 264/478 [1:12:04<53:34, 15.02s/it] {'loss': '0.1252', 'grad_norm': '4.281', 'learning_rate': '9.964e-06', 'ppl': '1.133', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '16.5', 'tokens/total': 8448000, 'tokens/trainable': 163721, 'epoch': '1.1'}
55%|█████▌ | 264/478 [1:12:04<53:34, 15.02s/it] 55%|█████▌ | 265/478 [1:12:19<53:15, 15.00s/it] {'loss': '0.1909', 'grad_norm': '4.844', 'learning_rate': '9.891e-06', 'ppl': '1.21', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.59', 'tokens/total': 8480000, 'tokens/trainable': 164366, 'epoch': '1.104'}
55%|█████▌ | 265/478 [1:12:19<53:15, 15.00s/it] 56%|█████▌ | 266/478 [1:12:34<52:57, 14.99s/it] {'loss': '0.1145', 'grad_norm': '4.062', 'learning_rate': '9.818e-06', 'ppl': '1.121', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.35', 'tokens/total': 8512000, 'tokens/trainable': 164944, 'epoch': '1.108'}
56%|█████▌ | 266/478 [1:12:34<52:57, 14.99s/it] 56%|█████▌ | 267/478 [1:12:49<52:40, 14.98s/it] {'loss': '0.1013', 'grad_norm': '3.5', 'learning_rate': '9.745e-06', 'ppl': '1.107', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.17', 'tokens/total': 8544000, 'tokens/trainable': 165576, 'epoch': '1.113'}
56%|█████▌ | 267/478 [1:12:49<52:40, 14.98s/it] 56%|█████▌ | 268/478 [1:13:04<52:22, 14.96s/it] {'loss': '0.1455', 'grad_norm': '4.281', 'learning_rate': '9.672e-06', 'ppl': '1.157', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '23.01', 'tokens/total': 8576000, 'tokens/trainable': 166262, 'epoch': '1.117'}
56%|█████▌ | 268/478 [1:13:04<52:22, 14.96s/it] 56%|█████▋ | 269/478 [1:13:19<52:06, 14.96s/it] {'loss': '0.1101', 'grad_norm': '3.75', 'learning_rate': '9.599e-06', 'ppl': '1.116', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.57', 'tokens/total': 8608000, 'tokens/trainable': 166936, 'epoch': '1.121'}
56%|█████▋ | 269/478 [1:13:19<52:06, 14.96s/it] 56%|█████▋ | 270/478 [1:13:34<51:51, 14.96s/it] {'loss': '0.1221', 'grad_norm': '4.031', 'learning_rate': '9.526e-06', 'ppl': '1.13', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '24.2', 'tokens/total': 8640000, 'tokens/trainable': 167659, 'epoch': '1.125'}
56%|█████▋ | 270/478 [1:13:34<51:51, 14.96s/it] 57%|█████▋ | 271/478 [1:13:49<51:36, 14.96s/it] {'loss': '0.09717', 'grad_norm': '4', 'learning_rate': '9.454e-06', 'ppl': '1.102', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.16', 'tokens/total': 8672000, 'tokens/trainable': 168291, 'epoch': '1.129'}
57%|█████▋ | 271/478 [1:13:49<51:36, 14.96s/it] 57%|█████▋ | 272/478 [1:14:04<51:21, 14.96s/it] {'loss': '0.123', 'grad_norm': '3.75', 'learning_rate': '9.381e-06', 'ppl': '1.131', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.56', 'tokens/total': 8704000, 'tokens/trainable': 168965, 'epoch': '1.133'}
57%|█████▋ | 272/478 [1:14:04<51:21, 14.96s/it] 57%|█████▋ | 273/478 [1:14:19<51:06, 14.96s/it] {'loss': '0.1218', 'grad_norm': '4.906', 'learning_rate': '9.308e-06', 'ppl': '1.13', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.79', 'tokens/total': 8736000, 'tokens/trainable': 169586, 'epoch': '1.137'}
57%|█████▋ | 273/478 [1:14:19<51:06, 14.96s/it] 57%|█████▋ | 274/478 [1:14:34<50:50, 14.96s/it] {'loss': '0.1606', 'grad_norm': '5.156', 'learning_rate': '9.235e-06', 'ppl': '1.174', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.16', 'tokens/total': 8768000, 'tokens/trainable': 170188, 'epoch': '1.142'}
57%|█████▋ | 274/478 [1:14:34<50:50, 14.96s/it] 58%|█████▊ | 275/478 [1:14:49<50:35, 14.95s/it] {'loss': '0.1206', 'grad_norm': '4.062', 'learning_rate': '9.163e-06', 'ppl': '1.128', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '23.04', 'tokens/total': 8800000, 'tokens/trainable': 170876, 'epoch': '1.146'}
58%|█████▊ | 275/478 [1:14:49<50:35, 14.95s/it] 58%|█████▊ | 276/478 [1:15:04<50:20, 14.95s/it] {'loss': '0.123', 'grad_norm': '4.781', 'learning_rate': '9.09e-06', 'ppl': '1.131', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.44', 'tokens/total': 8832000, 'tokens/trainable': 171546, 'epoch': '1.15'}
58%|█████▊ | 276/478 [1:15:04<50:20, 14.95s/it] 58%|█████▊ | 277/478 [1:15:18<50:05, 14.95s/it] {'loss': '0.08105', 'grad_norm': '3.578', 'learning_rate': '9.018e-06', 'ppl': '1.084', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.29', 'tokens/total': 8864000, 'tokens/trainable': 172122, 'epoch': '1.154'}
58%|█████▊ | 277/478 [1:15:19<50:05, 14.95s/it] 58%|█████▊ | 278/478 [1:15:33<49:50, 14.95s/it] {'loss': '0.1572', 'grad_norm': '4.594', 'learning_rate': '8.945e-06', 'ppl': '1.17', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '23.03', 'tokens/total': 8896000, 'tokens/trainable': 172810, 'epoch': '1.158'}
58%|█████▊ | 278/478 [1:15:33<49:50, 14.95s/it] 58%|█████▊ | 279/478 [1:15:48<49:34, 14.95s/it] {'loss': '0.09741', 'grad_norm': '3.922', 'learning_rate': '8.873e-06', 'ppl': '1.102', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '24.94', 'tokens/total': 8928000, 'tokens/trainable': 173554, 'epoch': '1.163'}
58%|█████▊ | 279/478 [1:15:48<49:34, 14.95s/it] 59%|█████▊ | 280/478 [1:16:03<49:20, 14.95s/it] {'loss': '0.09204', 'grad_norm': '3.609', 'learning_rate': '8.8e-06', 'ppl': '1.096', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.34', 'tokens/total': 8960000, 'tokens/trainable': 174162, 'epoch': '1.167'}
59%|█████▊ | 280/478 [1:16:03<49:20, 14.95s/it] 59%|█████▉ | 281/478 [1:16:18<49:05, 14.95s/it] {'loss': '0.09448', 'grad_norm': '3.656', 'learning_rate': '8.728e-06', 'ppl': '1.099', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.42', 'tokens/total': 8992000, 'tokens/trainable': 174802, 'epoch': '1.171'}
59%|█████▉ | 281/478 [1:16:18<49:05, 14.95s/it] 59%|█████▉ | 282/478 [1:16:33<48:50, 14.95s/it] {'loss': '0.09424', 'grad_norm': '3.75', 'learning_rate': '8.656e-06', 'ppl': '1.099', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '24.35', 'tokens/total': 9024000, 'tokens/trainable': 175529, 'epoch': '1.175'}
59%|█████▉ | 282/478 [1:16:33<48:50, 14.95s/it] 59%|█████▉ | 283/478 [1:16:48<48:34, 14.95s/it] {'loss': '0.1201', 'grad_norm': '4.125', 'learning_rate': '8.583e-06', 'ppl': '1.128', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.32', 'tokens/total': 9056000, 'tokens/trainable': 176195, 'epoch': '1.179'}
59%|█████▉ | 283/478 [1:16:48<48:34, 14.95s/it] 59%|█████▉ | 284/478 [1:17:03<48:20, 14.95s/it] {'loss': '0.1104', 'grad_norm': '3.953', 'learning_rate': '8.511e-06', 'ppl': '1.117', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.7', 'tokens/total': 9088000, 'tokens/trainable': 176843, 'epoch': '1.183'}
59%|█████▉ | 284/478 [1:17:03<48:20, 14.95s/it] 60%|█████▉ | 285/478 [1:17:18<48:05, 14.95s/it] {'loss': '0.1064', 'grad_norm': '4.281', 'learning_rate': '8.439e-06', 'ppl': '1.112', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.53', 'tokens/total': 9120000, 'tokens/trainable': 177516, 'epoch': '1.188'}
60%|█████▉ | 285/478 [1:17:18<48:05, 14.95s/it] 60%|█████▉ | 286/478 [1:17:33<47:51, 14.95s/it] {'loss': '0.1108', 'grad_norm': '4.344', 'learning_rate': '8.367e-06', 'ppl': '1.117', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.88', 'tokens/total': 9152000, 'tokens/trainable': 178140, 'epoch': '1.192'}
60%|█████▉ | 286/478 [1:17:33<47:51, 14.95s/it] 60%|██████ | 287/478 [1:17:48<47:36, 14.95s/it] {'loss': '0.09253', 'grad_norm': '3.922', 'learning_rate': '8.295e-06', 'ppl': '1.097', 'memory/max_active (GiB)': '45.17', 'memory/max_allocated (GiB)': '45.17', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.5', 'tokens/total': 9184000, 'tokens/trainable': 178812, 'epoch': '1.196'}
60%|██████ | 287/478 [1:17:48<47:36, 14.95s/it] 60%|██████ | 288/478 [1:18:03<47:21, 14.96s/it] {'loss': '0.1304', 'grad_norm': '5.625', 'learning_rate': '8.224e-06', 'ppl': '1.139', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.64', 'tokens/total': 9216000, 'tokens/trainable': 179369, 'epoch': '1.2'}
60%|██████ | 288/478 [1:18:03<47:21, 14.96s/it] 60%|██████ | 289/478 [1:18:18<47:06, 14.96s/it] {'loss': '0.1108', 'grad_norm': '4.625', 'learning_rate': '8.152e-06', 'ppl': '1.117', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.58', 'tokens/total': 9248000, 'tokens/trainable': 179984, 'epoch': '1.204'}
60%|██████ | 289/478 [1:18:18<47:06, 14.96s/it] 61%|██████ | 290/478 [1:18:33<46:51, 14.96s/it] {'loss': '0.09741', 'grad_norm': '5.594', 'learning_rate': '8.08e-06', 'ppl': '1.102', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.95', 'tokens/total': 9280000, 'tokens/trainable': 180550, 'epoch': '1.208'}
61%|██████ | 290/478 [1:18:33<46:51, 14.96s/it] 61%|██████ | 291/478 [1:18:48<46:36, 14.95s/it] {'loss': '0.08936', 'grad_norm': '4.219', 'learning_rate': '8.009e-06', 'ppl': '1.093', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.25', 'tokens/total': 9312000, 'tokens/trainable': 181095, 'epoch': '1.212'}
61%|██████ | 291/478 [1:18:48<46:36, 14.95s/it] 61%|██████ | 292/478 [1:19:03<46:21, 14.95s/it] {'loss': '0.134', 'grad_norm': '5.094', 'learning_rate': '7.938e-06', 'ppl': '1.143', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.67', 'tokens/total': 9344000, 'tokens/trainable': 181680, 'epoch': '1.217'}
61%|██████ | 292/478 [1:19:03<46:21, 14.95s/it] 61%|██████▏ | 293/478 [1:19:18<46:06, 14.95s/it] {'loss': '0.08301', 'grad_norm': '4.469', 'learning_rate': '7.866e-06', 'ppl': '1.087', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.52', 'tokens/total': 9376000, 'tokens/trainable': 182323, 'epoch': '1.221'}
61%|██████▏ | 293/478 [1:19:18<46:06, 14.95s/it] 62%|██████▏ | 294/478 [1:19:33<45:50, 14.95s/it] {'loss': '0.1326', 'grad_norm': '4.75', 'learning_rate': '7.795e-06', 'ppl': '1.142', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.77', 'tokens/total': 9408000, 'tokens/trainable': 182972, 'epoch': '1.225'}
62%|██████▏ | 294/478 [1:19:33<45:50, 14.95s/it] 62%|██████▏ | 295/478 [1:19:48<45:34, 14.94s/it] {'loss': '0.09399', 'grad_norm': '5.062', 'learning_rate': '7.724e-06', 'ppl': '1.099', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '23.34', 'tokens/total': 9440000, 'tokens/trainable': 183668, 'epoch': '1.229'}
62%|██████▏ | 295/478 [1:19:48<45:34, 14.94s/it] 62%|██████▏ | 296/478 [1:20:03<45:19, 14.94s/it] {'loss': '0.05933', 'grad_norm': '4.625', 'learning_rate': '7.653e-06', 'ppl': '1.061', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.68', 'tokens/total': 9472000, 'tokens/trainable': 184226, 'epoch': '1.233'}
62%|██████▏ | 296/478 [1:20:03<45:19, 14.94s/it] 62%|██████▏ | 297/478 [1:20:17<45:04, 14.94s/it] {'loss': '0.1252', 'grad_norm': '5.25', 'learning_rate': '7.582e-06', 'ppl': '1.133', 'memory/max_active (GiB)': '45.17', 'memory/max_allocated (GiB)': '45.17', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '23', 'tokens/total': 9504000, 'tokens/trainable': 184912, 'epoch': '1.238'}
62%|██████▏ | 297/478 [1:20:17<45:04, 14.94s/it] 62%|██████▏ | 298/478 [1:20:32<44:49, 14.94s/it] {'loss': '0.1157', 'grad_norm': '5.531', 'learning_rate': '7.512e-06', 'ppl': '1.123', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.55', 'tokens/total': 9536000, 'tokens/trainable': 185466, 'epoch': '1.242'}
62%|██████▏ | 298/478 [1:20:32<44:49, 14.94s/it] 63%|██████▎ | 299/478 [1:20:47<44:37, 14.96s/it] {'loss': '0.07788', 'grad_norm': '4.562', 'learning_rate': '7.441e-06', 'ppl': '1.081', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.14', 'tokens/total': 9568000, 'tokens/trainable': 186069, 'epoch': '1.246'}
63%|██████▎ | 299/478 [1:20:47<44:37, 14.96s/it] 63%|██████▎ | 300/478 [1:21:02<44:22, 14.96s/it] {'loss': '0.1213', 'grad_norm': '5.469', 'learning_rate': '7.371e-06', 'ppl': '1.129', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.9', 'tokens/total': 9600000, 'tokens/trainable': 186753, 'epoch': '1.25'}
63%|██████▎ | 300/478 [1:21:02<44:22, 14.96s/it] 63%|██████▎ | 301/478 [1:21:17<44:03, 14.94s/it] {'loss': '0.08154', 'grad_norm': '3.797', 'learning_rate': '7.301e-06', 'ppl': '1.085', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '24.01', 'tokens/total': 9632000, 'tokens/trainable': 187467, 'epoch': '1.254'}
63%|██████▎ | 301/478 [1:21:17<44:03, 14.94s/it] 63%|██████▎ | 302/478 [1:21:32<43:50, 14.94s/it] {'loss': '0.0835', 'grad_norm': '4.406', 'learning_rate': '7.23e-06', 'ppl': '1.087', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.69', 'tokens/total': 9664000, 'tokens/trainable': 188085, 'epoch': '1.258'}
63%|██████▎ | 302/478 [1:21:32<43:50, 14.94s/it] 63%|██████▎ | 303/478 [1:21:47<43:36, 14.95s/it] {'loss': '0.1338', 'grad_norm': '5.219', 'learning_rate': '7.16e-06', 'ppl': '1.143', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.31', 'tokens/total': 9696000, 'tokens/trainable': 188692, 'epoch': '1.262'}
63%|██████▎ | 303/478 [1:21:47<43:36, 14.95s/it] 64%|██████▎ | 304/478 [1:22:02<43:21, 14.95s/it] {'loss': '0.1152', 'grad_norm': '4.344', 'learning_rate': '7.091e-06', 'ppl': '1.122', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.93', 'tokens/total': 9728000, 'tokens/trainable': 189377, 'epoch': '1.267'}
64%|██████▎ | 304/478 [1:22:02<43:21, 14.95s/it] 64%|██████▍ | 305/478 [1:22:17<43:06, 14.95s/it] {'loss': '0.04266', 'grad_norm': '3.094', 'learning_rate': '7.021e-06', 'ppl': '1.044', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.04', 'tokens/total': 9760000, 'tokens/trainable': 189916, 'epoch': '1.271'}
64%|██████▍ | 305/478 [1:22:17<43:06, 14.95s/it] 64%|██████▍ | 306/478 [1:22:32<42:52, 14.95s/it] {'loss': '0.09497', 'grad_norm': '4.875', 'learning_rate': '6.951e-06', 'ppl': '1.1', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.25', 'tokens/total': 9792000, 'tokens/trainable': 190551, 'epoch': '1.275'}
64%|██████▍ | 306/478 [1:22:32<42:52, 14.95s/it] 64%|██████▍ | 307/478 [1:22:47<42:37, 14.95s/it] {'loss': '0.08691', 'grad_norm': '3.906', 'learning_rate': '6.882e-06', 'ppl': '1.091', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.1', 'tokens/total': 9824000, 'tokens/trainable': 191211, 'epoch': '1.279'}
64%|██████▍ | 307/478 [1:22:47<42:37, 14.95s/it] 64%|██████▍ | 308/478 [1:23:02<42:22, 14.95s/it] {'loss': '0.1401', 'grad_norm': '5.781', 'learning_rate': '6.813e-06', 'ppl': '1.15', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '16.63', 'tokens/total': 9856000, 'tokens/trainable': 191708, 'epoch': '1.283'}
64%|██████▍ | 308/478 [1:23:02<42:22, 14.95s/it] 65%|██████▍ | 309/478 [1:23:17<42:07, 14.96s/it] {'loss': '0.0658', 'grad_norm': '3.594', 'learning_rate': '6.744e-06', 'ppl': '1.068', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '17.97', 'tokens/total': 9888000, 'tokens/trainable': 192245, 'epoch': '1.288'}
65%|██████▍ | 309/478 [1:23:17<42:07, 14.96s/it] 65%|██████▍ | 310/478 [1:23:32<41:52, 14.96s/it] {'loss': '0.07153', 'grad_norm': '3.516', 'learning_rate': '6.675e-06', 'ppl': '1.074', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.16', 'tokens/total': 9920000, 'tokens/trainable': 192907, 'epoch': '1.292'}
65%|██████▍ | 310/478 [1:23:32<41:52, 14.96s/it] 65%|██████▌ | 311/478 [1:23:47<41:37, 14.96s/it] {'loss': '0.1274', 'grad_norm': '4.625', 'learning_rate': '6.606e-06', 'ppl': '1.136', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.87', 'tokens/total': 9952000, 'tokens/trainable': 193590, 'epoch': '1.296'}
65%|██████▌ | 311/478 [1:23:47<41:37, 14.96s/it] 65%|██████▌ | 312/478 [1:24:02<41:23, 14.96s/it] {'loss': '0.0614', 'grad_norm': '3.797', 'learning_rate': '6.538e-06', 'ppl': '1.063', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.28', 'tokens/total': 9984000, 'tokens/trainable': 194196, 'epoch': '1.3'}
65%|██████▌ | 312/478 [1:24:02<41:23, 14.96s/it] 65%|██████▌ | 313/478 [1:24:17<41:08, 14.96s/it] {'loss': '0.06641', 'grad_norm': '5.344', 'learning_rate': '6.47e-06', 'ppl': '1.069', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.52', 'tokens/total': 10016000, 'tokens/trainable': 194779, 'epoch': '1.304'}
65%|██████▌ | 313/478 [1:24:17<41:08, 14.96s/it] 66%|██████▌ | 314/478 [1:24:32<40:52, 14.95s/it] {'loss': '0.09595', 'grad_norm': '5.312', 'learning_rate': '6.402e-06', 'ppl': '1.101', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.09', 'tokens/total': 10048000, 'tokens/trainable': 195408, 'epoch': '1.308'}
66%|██████▌ | 314/478 [1:24:32<40:52, 14.95s/it] 66%|██████▌ | 315/478 [1:24:47<40:37, 14.95s/it] {'loss': '0.06055', 'grad_norm': '3.203', 'learning_rate': '6.334e-06', 'ppl': '1.062', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.68', 'tokens/total': 10080000, 'tokens/trainable': 195996, 'epoch': '1.312'}
66%|██████▌ | 315/478 [1:24:47<40:37, 14.95s/it] 66%|██████▌ | 316/478 [1:25:02<40:22, 14.95s/it] {'loss': '0.08887', 'grad_norm': '3.938', 'learning_rate': '6.266e-06', 'ppl': '1.093', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.32', 'tokens/total': 10112000, 'tokens/trainable': 196603, 'epoch': '1.317'}
66%|██████▌ | 316/478 [1:25:02<40:22, 14.95s/it] 66%|██████▋ | 317/478 [1:25:17<40:07, 14.95s/it] {'loss': '0.1387', 'grad_norm': '6.281', 'learning_rate': '6.198e-06', 'ppl': '1.149', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.02', 'tokens/total': 10144000, 'tokens/trainable': 197171, 'epoch': '1.321'}
66%|██████▋ | 317/478 [1:25:17<40:07, 14.95s/it] 67%|██████▋ | 318/478 [1:25:32<39:52, 14.95s/it] {'loss': '0.07373', 'grad_norm': '2.984', 'learning_rate': '6.131e-06', 'ppl': '1.077', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22', 'tokens/total': 10176000, 'tokens/trainable': 197828, 'epoch': '1.325'}
67%|██████▋ | 318/478 [1:25:32<39:52, 14.95s/it] 67%|██████▋ | 319/478 [1:25:46<39:37, 14.95s/it] {'loss': '0.0835', 'grad_norm': '4.781', 'learning_rate': '6.064e-06', 'ppl': '1.087', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.03', 'tokens/total': 10208000, 'tokens/trainable': 198456, 'epoch': '1.329'}
67%|██████▋ | 319/478 [1:25:46<39:37, 14.95s/it] 67%|██████▋ | 320/478 [1:26:01<39:21, 14.94s/it] {'loss': '0.07983', 'grad_norm': '4.688', 'learning_rate': '5.997e-06', 'ppl': '1.083', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22', 'tokens/total': 10240000, 'tokens/trainable': 199112, 'epoch': '1.333'}
67%|██████▋ | 320/478 [1:26:01<39:21, 14.94s/it] 67%|██████▋ | 321/478 [1:26:16<39:05, 14.94s/it] {'loss': '0.1011', 'grad_norm': '5.469', 'learning_rate': '5.93e-06', 'ppl': '1.106', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.63', 'tokens/total': 10272000, 'tokens/trainable': 199787, 'epoch': '1.337'}
67%|██████▋ | 321/478 [1:26:16<39:05, 14.94s/it] 67%|██████▋ | 322/478 [1:26:31<38:51, 14.95s/it] {'loss': '0.0813', 'grad_norm': '4.906', 'learning_rate': '5.864e-06', 'ppl': '1.085', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.55', 'tokens/total': 10304000, 'tokens/trainable': 200401, 'epoch': '1.342'}
67%|██████▋ | 322/478 [1:26:31<38:51, 14.95s/it] 68%|██████▊ | 323/478 [1:26:46<38:36, 14.95s/it] {'loss': '0.06006', 'grad_norm': '3.531', 'learning_rate': '5.798e-06', 'ppl': '1.062', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '16.24', 'tokens/total': 10336000, 'tokens/trainable': 200886, 'epoch': '1.346'}
68%|██████▊ | 323/478 [1:26:46<38:36, 14.95s/it] 68%|██████▊ | 324/478 [1:27:01<38:22, 14.95s/it] {'loss': '0.08911', 'grad_norm': '4.469', 'learning_rate': '5.732e-06', 'ppl': '1.093', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.7', 'tokens/total': 10368000, 'tokens/trainable': 201564, 'epoch': '1.35'}
68%|██████▊ | 324/478 [1:27:01<38:22, 14.95s/it] 68%|██████▊ | 325/478 [1:27:16<38:06, 14.95s/it] {'loss': '0.06714', 'grad_norm': '4.344', 'learning_rate': '5.666e-06', 'ppl': '1.069', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.53', 'tokens/total': 10400000, 'tokens/trainable': 202147, 'epoch': '1.354'}
68%|██████▊ | 325/478 [1:27:16<38:06, 14.95s/it] 68%|██████▊ | 326/478 [1:27:31<37:52, 14.95s/it] {'loss': '0.1201', 'grad_norm': '5.5', 'learning_rate': '5.6e-06', 'ppl': '1.128', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '17.44', 'tokens/total': 10432000, 'tokens/trainable': 202668, 'epoch': '1.358'}
68%|██████▊ | 326/478 [1:27:31<37:52, 14.95s/it] 68%|██████▊ | 327/478 [1:27:46<37:37, 14.95s/it] {'loss': '0.09717', 'grad_norm': '4', 'learning_rate': '5.535e-06', 'ppl': '1.102', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.83', 'tokens/total': 10464000, 'tokens/trainable': 203290, 'epoch': '1.363'}
68%|██████▊ | 327/478 [1:27:46<37:37, 14.95s/it] 69%|██████▊ | 328/478 [1:28:01<37:22, 14.95s/it] {'loss': '0.08154', 'grad_norm': '4.406', 'learning_rate': '5.47e-06', 'ppl': '1.085', 'memory/max_active (GiB)': '45.17', 'memory/max_allocated (GiB)': '45.17', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '23.24', 'tokens/total': 10496000, 'tokens/trainable': 203984, 'epoch': '1.367'}
69%|██████▊ | 328/478 [1:28:01<37:22, 14.95s/it] 69%|██████▉ | 329/478 [1:28:16<37:07, 14.95s/it] {'loss': '0.09863', 'grad_norm': '4.188', 'learning_rate': '5.405e-06', 'ppl': '1.104', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.06', 'tokens/total': 10528000, 'tokens/trainable': 204613, 'epoch': '1.371'}
69%|██████▉ | 329/478 [1:28:16<37:07, 14.95s/it] 69%|██████▉ | 330/478 [1:28:31<36:52, 14.95s/it] {'loss': '0.1023', 'grad_norm': '4.531', 'learning_rate': '5.34e-06', 'ppl': '1.108', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '23.94', 'tokens/total': 10560000, 'tokens/trainable': 205328, 'epoch': '1.375'}
69%|██████▉ | 330/478 [1:28:31<36:52, 14.95s/it] 69%|██████▉ | 331/478 [1:28:46<36:37, 14.95s/it] {'loss': '0.125', 'grad_norm': '5.469', 'learning_rate': '5.276e-06', 'ppl': '1.133', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.52', 'tokens/total': 10592000, 'tokens/trainable': 205941, 'epoch': '1.379'}
69%|██████▉ | 331/478 [1:28:46<36:37, 14.95s/it] 69%|██████▉ | 332/478 [1:29:01<36:23, 14.95s/it] {'loss': '0.09619', 'grad_norm': '4.906', 'learning_rate': '5.212e-06', 'ppl': '1.101', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.62', 'tokens/total': 10624000, 'tokens/trainable': 206497, 'epoch': '1.383'}
69%|██████▉ | 332/478 [1:29:01<36:23, 14.95s/it] 70%|██████▉ | 333/478 [1:29:16<36:08, 14.95s/it] {'loss': '0.1157', 'grad_norm': '4.875', 'learning_rate': '5.148e-06', 'ppl': '1.123', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22', 'tokens/total': 10656000, 'tokens/trainable': 207154, 'epoch': '1.387'}
70%|██████▉ | 333/478 [1:29:16<36:08, 14.95s/it] 70%|██████▉ | 334/478 [1:29:31<35:52, 14.95s/it] {'loss': '0.07983', 'grad_norm': '5.969', 'learning_rate': '5.084e-06', 'ppl': '1.083', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '17.48', 'tokens/total': 10688000, 'tokens/trainable': 207676, 'epoch': '1.392'}
70%|██████▉ | 334/478 [1:29:31<35:52, 14.95s/it] 70%|███████ | 335/478 [1:29:46<35:38, 14.95s/it] {'loss': '0.1125', 'grad_norm': '5.75', 'learning_rate': '5.021e-06', 'ppl': '1.119', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '16.1', 'tokens/total': 10720000, 'tokens/trainable': 208157, 'epoch': '1.396'}
70%|███████ | 335/478 [1:29:46<35:38, 14.95s/it] 70%|███████ | 336/478 [1:30:01<35:23, 14.95s/it] {'loss': '0.08423', 'grad_norm': '5.094', 'learning_rate': '4.958e-06', 'ppl': '1.088', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '16.7', 'tokens/total': 10752000, 'tokens/trainable': 208656, 'epoch': '1.4'}
70%|███████ | 336/478 [1:30:01<35:23, 14.95s/it] 71%|███████ | 337/478 [1:30:16<35:08, 14.95s/it] {'loss': '0.1116', 'grad_norm': '5.281', 'learning_rate': '4.895e-06', 'ppl': '1.118', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.7', 'tokens/total': 10784000, 'tokens/trainable': 209274, 'epoch': '1.404'}
71%|███████ | 337/478 [1:30:16<35:08, 14.95s/it] 71%|███████ | 338/478 [1:30:31<34:53, 14.95s/it] {'loss': '0.07593', 'grad_norm': '4.062', 'learning_rate': '4.833e-06', 'ppl': '1.079', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.89', 'tokens/total': 10816000, 'tokens/trainable': 209898, 'epoch': '1.408'}
71%|███████ | 338/478 [1:30:31<34:53, 14.95s/it] 71%|███████ | 339/478 [1:30:45<34:39, 14.96s/it] {'loss': '0.06348', 'grad_norm': '3.125', 'learning_rate': '4.77e-06', 'ppl': '1.066', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.86', 'tokens/total': 10848000, 'tokens/trainable': 210582, 'epoch': '1.413'}
71%|███████ | 339/478 [1:30:46<34:39, 14.96s/it] 71%|███████ | 340/478 [1:31:00<34:24, 14.96s/it] {'loss': '0.09473', 'grad_norm': '4.094', 'learning_rate': '4.708e-06', 'ppl': '1.099', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.6', 'tokens/total': 10880000, 'tokens/trainable': 211197, 'epoch': '1.417'}
71%|███████ | 340/478 [1:31:00<34:24, 14.96s/it] 71%|███████▏ | 341/478 [1:31:15<34:08, 14.96s/it] {'loss': '0.1062', 'grad_norm': '5.125', 'learning_rate': '4.647e-06', 'ppl': '1.112', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.61', 'tokens/total': 10912000, 'tokens/trainable': 211753, 'epoch': '1.421'}
71%|███████▏ | 341/478 [1:31:15<34:08, 14.96s/it] 72%|███████▏ | 342/478 [1:31:30<33:54, 14.96s/it] {'loss': '0.08618', 'grad_norm': '4.469', 'learning_rate': '4.585e-06', 'ppl': '1.09', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.26', 'tokens/total': 10944000, 'tokens/trainable': 212388, 'epoch': '1.425'}
72%|███████▏ | 342/478 [1:31:30<33:54, 14.96s/it] 72%|███████▏ | 343/478 [1:31:45<33:38, 14.96s/it] {'loss': '0.07764', 'grad_norm': '5.062', 'learning_rate': '4.524e-06', 'ppl': '1.081', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '19.42', 'tokens/total': 10976000, 'tokens/trainable': 212968, 'epoch': '1.429'}
72%|███████▏ | 343/478 [1:31:45<33:38, 14.96s/it] 72%|███████▏ | 344/478 [1:32:00<33:24, 14.96s/it] {'loss': '0.1353', 'grad_norm': '5.312', 'learning_rate': '4.463e-06', 'ppl': '1.145', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.88', 'tokens/total': 11008000, 'tokens/trainable': 213592, 'epoch': '1.433'}
72%|███████▏ | 344/478 [1:32:00<33:24, 14.96s/it] 72%|███████▏ | 345/478 [1:32:15<33:08, 14.95s/it] {'loss': '0.07617', 'grad_norm': '3.844', 'learning_rate': '4.403e-06', 'ppl': '1.079', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.47', 'tokens/total': 11040000, 'tokens/trainable': 214233, 'epoch': '1.438'}
72%|███████▏ | 345/478 [1:32:15<33:08, 14.95s/it] 72%|███████▏ | 346/478 [1:32:30<32:53, 14.95s/it] {'loss': '0.08276', 'grad_norm': '4.844', 'learning_rate': '4.342e-06', 'ppl': '1.086', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '23.3', 'tokens/total': 11072000, 'tokens/trainable': 214929, 'epoch': '1.442'}
72%|███████▏ | 346/478 [1:32:30<32:53, 14.95s/it] 73%|███████▎ | 347/478 [1:32:45<32:38, 14.95s/it] {'loss': '0.1309', 'grad_norm': '5.031', 'learning_rate': '4.282e-06', 'ppl': '1.14', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.93', 'tokens/total': 11104000, 'tokens/trainable': 215554, 'epoch': '1.446'}
73%|███████▎ | 347/478 [1:32:45<32:38, 14.95s/it] 73%|███████▎ | 348/478 [1:33:00<32:23, 14.95s/it] {'loss': '0.0874', 'grad_norm': '4.156', 'learning_rate': '4.223e-06', 'ppl': '1.091', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.43', 'tokens/total': 11136000, 'tokens/trainable': 216164, 'epoch': '1.45'}
73%|███████▎ | 348/478 [1:33:00<32:23, 14.95s/it] 73%|███████▎ | 349/478 [1:33:15<32:08, 14.95s/it] {'loss': '0.1606', 'grad_norm': '8', 'learning_rate': '4.164e-06', 'ppl': '1.174', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.89', 'tokens/total': 11168000, 'tokens/trainable': 216728, 'epoch': '1.454'}
73%|███████▎ | 349/478 [1:33:15<32:08, 14.95s/it] 73%|███████▎ | 350/478 [1:33:30<31:53, 14.95s/it] {'loss': '0.1328', 'grad_norm': '5.344', 'learning_rate': '4.104e-06', 'ppl': '1.142', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.63', 'tokens/total': 11200000, 'tokens/trainable': 217374, 'epoch': '1.458'}
73%|███████▎ | 350/478 [1:33:30<31:53, 14.95s/it] 73%|███████▎ | 351/478 [1:33:45<31:39, 14.95s/it] {'loss': '0.07727', 'grad_norm': '3.109', 'learning_rate': '4.046e-06', 'ppl': '1.08', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.99', 'tokens/total': 11232000, 'tokens/trainable': 218031, 'epoch': '1.462'}
73%|███████▎ | 351/478 [1:33:45<31:39, 14.95s/it] 74%|███████▎ | 352/478 [1:34:00<31:24, 14.95s/it] {'loss': '0.1194', 'grad_norm': '5.219', 'learning_rate': '3.987e-06', 'ppl': '1.127', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.88', 'tokens/total': 11264000, 'tokens/trainable': 218595, 'epoch': '1.467'}
74%|███████▎ | 352/478 [1:34:00<31:24, 14.95s/it] 74%|███████▍ | 353/478 [1:34:15<31:06, 14.94s/it] {'loss': '0.1445', 'grad_norm': '5.031', 'learning_rate': '3.929e-06', 'ppl': '1.155', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.72', 'tokens/total': 11296000, 'tokens/trainable': 219241, 'epoch': '1.471'}
74%|███████▍ | 353/478 [1:34:15<31:06, 14.94s/it] 74%|███████▍ | 354/478 [1:34:30<30:52, 14.94s/it] {'loss': '0.0697', 'grad_norm': '4.438', 'learning_rate': '3.872e-06', 'ppl': '1.072', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.61', 'tokens/total': 11328000, 'tokens/trainable': 219916, 'epoch': '1.475'}
74%|███████▍ | 354/478 [1:34:30<30:52, 14.94s/it] 74%|███████▍ | 355/478 [1:34:45<30:38, 14.94s/it] {'loss': '0.1052', 'grad_norm': '5.062', 'learning_rate': '3.814e-06', 'ppl': '1.111', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '20.15', 'tokens/total': 11360000, 'tokens/trainable': 220518, 'epoch': '1.479'}
74%|███████▍ | 355/478 [1:34:45<30:38, 14.94s/it] 74%|███████▍ | 356/478 [1:35:00<30:23, 14.95s/it] {'loss': '0.08936', 'grad_norm': '4.531', 'learning_rate': '3.757e-06', 'ppl': '1.093', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '18.04', 'tokens/total': 11392000, 'tokens/trainable': 221057, 'epoch': '1.483'}
74%|███████▍ | 356/478 [1:35:00<30:23, 14.95s/it] 75%|███████▍ | 357/478 [1:35:15<30:08, 14.95s/it] {'loss': '0.1221', 'grad_norm': '4.969', 'learning_rate': '3.7e-06', 'ppl': '1.13', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.69', 'tokens/total': 11424000, 'tokens/trainable': 221705, 'epoch': '1.488'}
75%|███████▍ | 357/478 [1:35:15<30:08, 14.95s/it] 75%|███████▍ | 358/478 [1:35:30<29:54, 14.95s/it] {'loss': '0.09595', 'grad_norm': '4.25', 'learning_rate': '3.644e-06', 'ppl': '1.101', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '21.76', 'tokens/total': 11456000, 'tokens/trainable': 222355, 'epoch': '1.492'}
75%|███████▍ | 358/478 [1:35:30<29:54, 14.95s/it] 75%|███████▌ | 359/478 [1:35:45<29:39, 14.95s/it] {'loss': '0.1006', 'grad_norm': '5.406', 'learning_rate': '3.588e-06', 'ppl': '1.106', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.99', 'tokens/total': 11488000, 'tokens/trainable': 223042, 'epoch': '1.496'}
75%|███████▌ | 359/478 [1:35:45<29:39, 14.95s/it] 75%|███████▌ | 360/478 [1:35:59<29:24, 14.95s/it] {'loss': '0.08984', 'grad_norm': '4.344', 'learning_rate': '3.532e-06', 'ppl': '1.094', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.54', 'tokens/train_per_sec_per_gpu': '22.26', 'tokens/total': 11520000, 'tokens/trainable': 223707, 'epoch': '1.5'}
75%|███████▌ | 360/478 [1:35:59<29:24, 14.95s/it][2026-04-17 03:45:24,126] [INFO] [axolotl.core.trainers.base] Running evaluation step...
[2026-04-17 03:45:31,509] [INFO] [axolotl.utils.samplers.multipack] gather_len_batches: [54, 54]
0%| | 0/27 [00:00<?, ?it/s]
7%|▋ | 2/27 [00:02<00:35, 1.41s/it]
11%|█ | 3/27 [00:05<00:47, 1.97s/it]
15%|█▍ | 4/27 [00:08<00:51, 2.26s/it]
19%|█▊ | 5/27 [00:11<00:53, 2.43s/it]
22%|██▏ | 6/27 [00:13<00:53, 2.54s/it]
26%|██▌ | 7/27 [00:16<00:52, 2.61s/it]
30%|██▉ | 8/27 [00:19<00:50, 2.65s/it]
33%|███▎ | 9/27 [00:22<00:48, 2.68s/it]
37%|███▋ | 10/27 [00:24<00:45, 2.70s/it]
41%|████ | 11/27 [00:27<00:43, 2.72s/it]
44%|████▍ | 12/27 [00:30<00:39, 2.64s/it]
48%|████▊ | 13/27 [00:33<00:38, 2.76s/it]
52%|█████▏ | 14/27 [00:35<00:35, 2.76s/it]
56%|█████▌ | 15/27 [00:38<00:33, 2.76s/it]
59%|█████▉ | 16/27 [00:41<00:30, 2.76s/it]
63%|██████▎ | 17/27 [00:44<00:27, 2.75s/it]
67%|██████▋ | 18/27 [00:46<00:24, 2.75s/it]
70%|███████ | 19/27 [00:49<00:22, 2.75s/it]
74%|███████▍ | 20/27 [00:52<00:19, 2.75s/it]
78%|███████▊ | 21/27 [00:54<00:15, 2.67s/it]
81%|████████▏ | 22/27 [00:57<00:13, 2.78s/it]
85%|████████▌ | 23/27 [01:00<00:11, 2.77s/it]
89%|████████▉ | 24/27 [01:03<00:08, 2.77s/it]
93%|█████████▎| 25/27 [01:06<00:05, 2.76s/it]
96%|█████████▋| 26/27 [01:08<00:02, 2.76s/it]
100%|██████████| 27/27 [01:11<00:00, 2.79s/it]
{'eval_loss': '0.2251', 'eval_runtime': '75.08', 'eval_samples_per_second': '2.784', 'eval_steps_per_second': '1.398', 'eval_ppl': '1.252', 'memory/max_active (GiB)': '34.91', 'memory/max_allocated (GiB)': '34.91', 'memory/device_reserved (GiB)': '57.54', 'epoch': '1.5', 'tokens/train_per_sec_per_gpu': '0'}
75%|███████▌ | 360/478 [1:37:22<29:24, 14.95s/it]
100%|██████████| 27/27 [01:13<00:00, 2.79s/it]
76%|███████▌ | 361/478 [1:37:37<1:17:25, 39.71s/it] {'loss': '0.09912', 'grad_norm': '4.531', 'learning_rate': '3.476e-06', 'ppl': '1.104', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '17.6', 'tokens/total': 11552000, 'tokens/trainable': 224233, 'epoch': '1.504'}
76%|███████▌ | 361/478 [1:37:37<1:17:25, 39.71s/it] 76%|███████▌ | 362/478 [1:37:52<1:02:24, 32.28s/it] {'loss': '0.08459', 'grad_norm': '4.062', 'learning_rate': '3.421e-06', 'ppl': '1.088', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '19.69', 'tokens/total': 11584000, 'tokens/trainable': 224821, 'epoch': '1.508'}
76%|███████▌ | 362/478 [1:37:52<1:02:24, 32.28s/it] 76%|███████▌ | 363/478 [1:38:07<51:55, 27.09s/it] {'loss': '0.1008', 'grad_norm': '4.812', 'learning_rate': '3.367e-06', 'ppl': '1.106', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '17.31', 'tokens/total': 11616000, 'tokens/trainable': 225338, 'epoch': '1.512'}
76%|███████▌ | 363/478 [1:38:07<51:55, 27.09s/it] 76%|███████▌ | 364/478 [1:38:22<44:32, 23.45s/it] {'loss': '0.1174', 'grad_norm': '5.188', 'learning_rate': '3.312e-06', 'ppl': '1.125', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '17.41', 'tokens/total': 11648000, 'tokens/trainable': 225858, 'epoch': '1.517'}
76%|███████▌ | 364/478 [1:38:22<44:32, 23.45s/it] 76%|███████▋ | 365/478 [1:38:37<39:21, 20.90s/it] {'loss': '0.1138', 'grad_norm': '5.594', 'learning_rate': '3.258e-06', 'ppl': '1.12', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20.13', 'tokens/total': 11680000, 'tokens/trainable': 226459, 'epoch': '1.521'}
76%|███████▋ | 365/478 [1:38:37<39:21, 20.90s/it] 77%|███████▋ | 366/478 [1:38:52<35:40, 19.11s/it] {'loss': '0.1018', 'grad_norm': '4.125', 'learning_rate': '3.205e-06', 'ppl': '1.107', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.9', 'tokens/total': 11712000, 'tokens/trainable': 227113, 'epoch': '1.525'}
77%|███████▋ | 366/478 [1:38:52<35:40, 19.11s/it] 77%|███████▋ | 367/478 [1:39:07<33:02, 17.86s/it] {'loss': '0.09009', 'grad_norm': '4.031', 'learning_rate': '3.151e-06', 'ppl': '1.094', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.33', 'tokens/total': 11744000, 'tokens/trainable': 227750, 'epoch': '1.529'}
77%|███████▋ | 367/478 [1:39:07<33:02, 17.86s/it] 77%|███████▋ | 368/478 [1:39:22<31:09, 16.99s/it] {'loss': '0.09216', 'grad_norm': '4.25', 'learning_rate': '3.098e-06', 'ppl': '1.097', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '18.14', 'tokens/total': 11776000, 'tokens/trainable': 228292, 'epoch': '1.533'}
77%|███████▋ | 368/478 [1:39:22<31:09, 16.99s/it] 77%|███████▋ | 369/478 [1:39:37<29:45, 16.38s/it] {'loss': '0.115', 'grad_norm': '5.594', 'learning_rate': '3.046e-06', 'ppl': '1.122', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '19.96', 'tokens/total': 11808000, 'tokens/trainable': 228888, 'epoch': '1.538'}
77%|███████▋ | 369/478 [1:39:37<29:45, 16.38s/it] 77%|███████▋ | 370/478 [1:39:51<28:42, 15.95s/it] {'loss': '0.09424', 'grad_norm': '4.344', 'learning_rate': '2.994e-06', 'ppl': '1.099', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20', 'tokens/total': 11840000, 'tokens/trainable': 229485, 'epoch': '1.542'}
77%|███████▋ | 370/478 [1:39:52<28:42, 15.95s/it] 78%|███████▊ | 371/478 [1:40:06<27:54, 15.65s/it] {'loss': '0.1067', 'grad_norm': '6.469', 'learning_rate': '2.942e-06', 'ppl': '1.113', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '17.93', 'tokens/total': 11872000, 'tokens/trainable': 230020, 'epoch': '1.546'}
78%|███████▊ | 371/478 [1:40:06<27:54, 15.65s/it] 78%|███████▊ | 372/478 [1:40:21<27:16, 15.44s/it] {'loss': '0.1021', 'grad_norm': '5.469', 'learning_rate': '2.89e-06', 'ppl': '1.107', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '18.49', 'tokens/total': 11904000, 'tokens/trainable': 230572, 'epoch': '1.55'}
78%|███████▊ | 372/478 [1:40:21<27:16, 15.44s/it] 78%|███████▊ | 373/478 [1:40:36<26:44, 15.28s/it] {'loss': '0.1387', 'grad_norm': '4.656', 'learning_rate': '2.839e-06', 'ppl': '1.149', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '23.25', 'tokens/total': 11936000, 'tokens/trainable': 231265, 'epoch': '1.554'}
78%|███████▊ | 373/478 [1:40:36<26:44, 15.28s/it] 78%|███████▊ | 374/478 [1:40:51<26:20, 15.20s/it] {'loss': '0.137', 'grad_norm': '5.344', 'learning_rate': '2.789e-06', 'ppl': '1.147', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.97', 'tokens/total': 11968000, 'tokens/trainable': 231953, 'epoch': '1.558'}
78%|███████▊ | 374/478 [1:40:51<26:20, 15.20s/it] 78%|███████▊ | 375/478 [1:41:06<25:57, 15.12s/it] {'loss': '0.09937', 'grad_norm': '3.984', 'learning_rate': '2.738e-06', 'ppl': '1.104', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.44', 'tokens/total': 12000000, 'tokens/trainable': 232623, 'epoch': '1.562'}
78%|███████▊ | 375/478 [1:41:06<25:57, 15.12s/it] 79%|███████▊ | 376/478 [1:41:21<25:37, 15.07s/it] {'loss': '0.116', 'grad_norm': '4.875', 'learning_rate': '2.688e-06', 'ppl': '1.123', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20.19', 'tokens/total': 12032000, 'tokens/trainable': 233226, 'epoch': '1.567'}
79%|███████▊ | 376/478 [1:41:21<25:37, 15.07s/it] 79%|███████▉ | 377/478 [1:41:36<25:17, 15.03s/it] {'loss': '0.1116', 'grad_norm': '5.531', 'learning_rate': '2.639e-06', 'ppl': '1.118', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20.79', 'tokens/total': 12064000, 'tokens/trainable': 233846, 'epoch': '1.571'}
79%|███████▉ | 377/478 [1:41:36<25:17, 15.03s/it] 79%|███████▉ | 378/478 [1:41:51<25:00, 15.01s/it] {'loss': '0.1494', 'grad_norm': '6.625', 'learning_rate': '2.59e-06', 'ppl': '1.161', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '19.32', 'tokens/total': 12096000, 'tokens/trainable': 234423, 'epoch': '1.575'}
79%|███████▉ | 378/478 [1:41:51<25:00, 15.01s/it] 79%|███████▉ | 379/478 [1:42:06<24:43, 14.99s/it] {'loss': '0.07104', 'grad_norm': '3.75', 'learning_rate': '2.541e-06', 'ppl': '1.074', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20.13', 'tokens/total': 12128000, 'tokens/trainable': 235024, 'epoch': '1.579'}
79%|███████▉ | 379/478 [1:42:06<24:43, 14.99s/it] 79%|███████▉ | 380/478 [1:42:21<24:27, 14.98s/it] {'loss': '0.09961', 'grad_norm': '5.688', 'learning_rate': '2.493e-06', 'ppl': '1.105', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '16.68', 'tokens/total': 12160000, 'tokens/trainable': 235522, 'epoch': '1.583'}
79%|███████▉ | 380/478 [1:42:21<24:27, 14.98s/it] 80%|███████▉ | 381/478 [1:42:36<24:10, 14.95s/it] {'loss': '0.1062', 'grad_norm': '4.531', 'learning_rate': '2.445e-06', 'ppl': '1.112', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.69', 'tokens/total': 12192000, 'tokens/trainable': 236167, 'epoch': '1.587'}
80%|███████▉ | 381/478 [1:42:36<24:10, 14.95s/it] 80%|███████▉ | 382/478 [1:42:51<23:55, 14.95s/it] {'loss': '0.1431', 'grad_norm': '5.75', 'learning_rate': '2.397e-06', 'ppl': '1.154', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.84', 'tokens/total': 12224000, 'tokens/trainable': 236819, 'epoch': '1.592'}
80%|███████▉ | 382/478 [1:42:51<23:55, 14.95s/it] 80%|████████ | 383/478 [1:43:06<23:40, 14.95s/it] {'loss': '0.1384', 'grad_norm': '5.062', 'learning_rate': '2.35e-06', 'ppl': '1.148', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.09', 'tokens/total': 12256000, 'tokens/trainable': 237449, 'epoch': '1.596'}
80%|████████ | 383/478 [1:43:06<23:40, 14.95s/it] 80%|████████ | 384/478 [1:43:21<23:25, 14.95s/it] {'loss': '0.09216', 'grad_norm': '4.125', 'learning_rate': '2.303e-06', 'ppl': '1.097', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '18.75', 'tokens/total': 12288000, 'tokens/trainable': 238009, 'epoch': '1.6'}
80%|████████ | 384/478 [1:43:21<23:25, 14.95s/it] 81%|████████ | 385/478 [1:43:36<23:10, 14.95s/it] {'loss': '0.09546', 'grad_norm': '4.75', 'learning_rate': '2.257e-06', 'ppl': '1.1', 'memory/max_active (GiB)': '45.17', 'memory/max_allocated (GiB)': '45.17', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '19.33', 'tokens/total': 12320000, 'tokens/trainable': 238586, 'epoch': '1.604'}
81%|████████ | 385/478 [1:43:36<23:10, 14.95s/it] 81%|████████ | 386/478 [1:43:51<22:54, 14.94s/it] {'loss': '0.06152', 'grad_norm': '3.656', 'learning_rate': '2.211e-06', 'ppl': '1.063', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '18.85', 'tokens/total': 12352000, 'tokens/trainable': 239148, 'epoch': '1.608'}
81%|████████ | 386/478 [1:43:51<22:54, 14.94s/it] 81%|████████ | 387/478 [1:44:06<22:39, 14.94s/it] {'loss': '0.09692', 'grad_norm': '4.656', 'learning_rate': '2.165e-06', 'ppl': '1.102', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20.66', 'tokens/total': 12384000, 'tokens/trainable': 239765, 'epoch': '1.613'}
81%|████████ | 387/478 [1:44:06<22:39, 14.94s/it] 81%|████████ | 388/478 [1:44:20<22:25, 14.95s/it] {'loss': '0.1025', 'grad_norm': '4.562', 'learning_rate': '2.12e-06', 'ppl': '1.108', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.3', 'tokens/total': 12416000, 'tokens/trainable': 240431, 'epoch': '1.617'}
81%|████████ | 388/478 [1:44:21<22:25, 14.95s/it] 81%|████████▏ | 389/478 [1:44:35<22:10, 14.95s/it] {'loss': '0.09546', 'grad_norm': '4.188', 'learning_rate': '2.076e-06', 'ppl': '1.1', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '19.79', 'tokens/total': 12448000, 'tokens/trainable': 241022, 'epoch': '1.621'}
81%|████████▏ | 389/478 [1:44:35<22:10, 14.95s/it] 82%|████████▏ | 390/478 [1:44:50<21:55, 14.95s/it] {'loss': '0.07373', 'grad_norm': '3.828', 'learning_rate': '2.031e-06', 'ppl': '1.077', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '18.79', 'tokens/total': 12480000, 'tokens/trainable': 241583, 'epoch': '1.625'}
82%|████████▏ | 390/478 [1:44:50<21:55, 14.95s/it] 82%|████████▏ | 391/478 [1:45:05<21:40, 14.95s/it] {'loss': '0.09595', 'grad_norm': '3.969', 'learning_rate': '1.988e-06', 'ppl': '1.101', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '18.89', 'tokens/total': 12512000, 'tokens/trainable': 242147, 'epoch': '1.629'}
82%|████████▏ | 391/478 [1:45:05<21:40, 14.95s/it] 82%|████████▏ | 392/478 [1:45:20<21:25, 14.95s/it] {'loss': '0.1494', 'grad_norm': '5.188', 'learning_rate': '1.944e-06', 'ppl': '1.161', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.65', 'tokens/total': 12544000, 'tokens/trainable': 242823, 'epoch': '1.633'}
82%|████████▏ | 392/478 [1:45:20<21:25, 14.95s/it] 82%|████████▏ | 393/478 [1:45:35<21:10, 14.95s/it] {'loss': '0.1069', 'grad_norm': '5.5', 'learning_rate': '1.901e-06', 'ppl': '1.113', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.85', 'tokens/total': 12576000, 'tokens/trainable': 243505, 'epoch': '1.637'}
82%|████████▏ | 393/478 [1:45:35<21:10, 14.95s/it] 82%|████████▏ | 394/478 [1:45:50<20:55, 14.95s/it] {'loss': '0.08936', 'grad_norm': '5.531', 'learning_rate': '1.859e-06', 'ppl': '1.093', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20.29', 'tokens/total': 12608000, 'tokens/trainable': 244111, 'epoch': '1.642'}
82%|████████▏ | 394/478 [1:45:50<20:55, 14.95s/it] 83%|████████▎ | 395/478 [1:46:05<20:40, 14.95s/it] {'loss': '0.1116', 'grad_norm': '4.312', 'learning_rate': '1.817e-06', 'ppl': '1.118', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '19.6', 'tokens/total': 12640000, 'tokens/trainable': 244696, 'epoch': '1.646'}
83%|████████▎ | 395/478 [1:46:05<20:40, 14.95s/it] 83%|████████▎ | 396/478 [1:46:20<20:25, 14.94s/it] {'loss': '0.1196', 'grad_norm': '4.875', 'learning_rate': '1.775e-06', 'ppl': '1.127', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '23.28', 'tokens/total': 12672000, 'tokens/trainable': 245390, 'epoch': '1.65'}
83%|████████▎ | 396/478 [1:46:20<20:25, 14.94s/it] 83%|████████▎ | 397/478 [1:46:35<20:10, 14.95s/it] {'loss': '0.1062', 'grad_norm': '3.922', 'learning_rate': '1.734e-06', 'ppl': '1.112', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.29', 'tokens/total': 12704000, 'tokens/trainable': 246056, 'epoch': '1.654'}
83%|████████▎ | 397/478 [1:46:35<20:10, 14.95s/it] 83%|████████▎ | 398/478 [1:46:50<19:55, 14.95s/it] {'loss': '0.124', 'grad_norm': '4.469', 'learning_rate': '1.693e-06', 'ppl': '1.132', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.6', 'tokens/total': 12736000, 'tokens/trainable': 246701, 'epoch': '1.658'}
83%|████████▎ | 398/478 [1:46:50<19:55, 14.95s/it] 83%|████████▎ | 399/478 [1:47:05<19:40, 14.95s/it] {'loss': '0.07373', 'grad_norm': '4.344', 'learning_rate': '1.653e-06', 'ppl': '1.077', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '18.28', 'tokens/total': 12768000, 'tokens/trainable': 247247, 'epoch': '1.663'}
83%|████████▎ | 399/478 [1:47:05<19:40, 14.95s/it] 84%|████████▎ | 400/478 [1:47:20<19:25, 14.95s/it] {'loss': '0.08398', 'grad_norm': '3.781', 'learning_rate': '1.613e-06', 'ppl': '1.088', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '23.24', 'tokens/total': 12800000, 'tokens/trainable': 247941, 'epoch': '1.667'}
84%|████████▎ | 400/478 [1:47:20<19:25, 14.95s/it] 84%|████████▍ | 401/478 [1:47:35<19:11, 14.95s/it] {'loss': '0.09668', 'grad_norm': '5.562', 'learning_rate': '1.573e-06', 'ppl': '1.102', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '17.41', 'tokens/total': 12832000, 'tokens/trainable': 248461, 'epoch': '1.671'}
84%|████████▍ | 401/478 [1:47:35<19:11, 14.95s/it] 84%|████████▍ | 402/478 [1:47:50<18:56, 14.95s/it] {'loss': '0.1016', 'grad_norm': '4.344', 'learning_rate': '1.534e-06', 'ppl': '1.107', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.1', 'tokens/total': 12864000, 'tokens/trainable': 249091, 'epoch': '1.675'}
84%|████████▍ | 402/478 [1:47:50<18:56, 14.95s/it] 84%|████████▍ | 403/478 [1:48:05<18:41, 14.95s/it] {'loss': '0.127', 'grad_norm': '5.969', 'learning_rate': '1.496e-06', 'ppl': '1.135', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.63', 'tokens/total': 12896000, 'tokens/trainable': 249737, 'epoch': '1.679'}
84%|████████▍ | 403/478 [1:48:05<18:41, 14.95s/it] 85%|████████▍ | 404/478 [1:48:20<18:26, 14.95s/it] {'loss': '0.09387', 'grad_norm': '4.5', 'learning_rate': '1.457e-06', 'ppl': '1.098', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '16.68', 'tokens/total': 12928000, 'tokens/trainable': 250235, 'epoch': '1.683'}
85%|████████▍ | 404/478 [1:48:20<18:26, 14.95s/it] 85%|████████▍ | 405/478 [1:48:35<18:11, 14.95s/it] {'loss': '0.1504', 'grad_norm': '6.469', 'learning_rate': '1.42e-06', 'ppl': '1.162', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.6', 'tokens/total': 12960000, 'tokens/trainable': 250910, 'epoch': '1.688'}
85%|████████▍ | 405/478 [1:48:35<18:11, 14.95s/it] 85%|████████▍ | 406/478 [1:48:50<17:56, 14.95s/it] {'loss': '0.08569', 'grad_norm': '5.438', 'learning_rate': '1.383e-06', 'ppl': '1.089', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '18.58', 'tokens/total': 12992000, 'tokens/trainable': 251465, 'epoch': '1.692'}
85%|████████▍ | 406/478 [1:48:50<17:56, 14.95s/it] 85%|████████▌ | 407/478 [1:49:04<17:41, 14.94s/it] {'loss': '0.1079', 'grad_norm': '4.562', 'learning_rate': '1.346e-06', 'ppl': '1.114', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.03', 'tokens/total': 13024000, 'tokens/trainable': 252092, 'epoch': '1.696'}
85%|████████▌ | 407/478 [1:49:05<17:41, 14.94s/it] 85%|████████▌ | 408/478 [1:49:19<17:26, 14.94s/it] {'loss': '0.105', 'grad_norm': '4.719', 'learning_rate': '1.31e-06', 'ppl': '1.111', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '19.23', 'tokens/total': 13056000, 'tokens/trainable': 252666, 'epoch': '1.7'}
85%|████████▌ | 408/478 [1:49:19<17:26, 14.94s/it] 86%|████████▌ | 409/478 [1:49:34<17:11, 14.95s/it] {'loss': '0.1016', 'grad_norm': '5.094', 'learning_rate': '1.274e-06', 'ppl': '1.107', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.03', 'tokens/total': 13088000, 'tokens/trainable': 253294, 'epoch': '1.704'}
86%|████████▌ | 409/478 [1:49:34<17:11, 14.95s/it] 86%|████████▌ | 410/478 [1:49:49<16:56, 14.95s/it] {'loss': '0.1023', 'grad_norm': '4.938', 'learning_rate': '1.238e-06', 'ppl': '1.108', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20.95', 'tokens/total': 13120000, 'tokens/trainable': 253920, 'epoch': '1.708'}
86%|████████▌ | 410/478 [1:49:49<16:56, 14.95s/it] 86%|████████▌ | 411/478 [1:50:04<16:42, 14.96s/it] {'loss': '0.1167', 'grad_norm': '5.812', 'learning_rate': '1.203e-06', 'ppl': '1.124', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20.77', 'tokens/total': 13152000, 'tokens/trainable': 254541, 'epoch': '1.712'}
86%|████████▌ | 411/478 [1:50:04<16:42, 14.96s/it] 86%|████████▌ | 412/478 [1:50:19<16:26, 14.95s/it] {'loss': '0.1777', 'grad_norm': '5.562', 'learning_rate': '1.169e-06', 'ppl': '1.195', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.07', 'tokens/total': 13184000, 'tokens/trainable': 255200, 'epoch': '1.717'}
86%|████████▌ | 412/478 [1:50:19<16:26, 14.95s/it] 86%|████████▋ | 413/478 [1:50:34<16:12, 14.95s/it] {'loss': '0.1094', 'grad_norm': '4.25', 'learning_rate': '1.135e-06', 'ppl': '1.116', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20.79', 'tokens/total': 13216000, 'tokens/trainable': 255821, 'epoch': '1.721'}
86%|████████▋ | 413/478 [1:50:34<16:12, 14.95s/it] 87%|████████▋ | 414/478 [1:50:49<15:57, 14.96s/it] {'loss': '0.1216', 'grad_norm': '5', 'learning_rate': '1.102e-06', 'ppl': '1.129', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '24.83', 'tokens/total': 13248000, 'tokens/trainable': 256563, 'epoch': '1.725'}
87%|████████▋ | 414/478 [1:50:49<15:57, 14.96s/it] 87%|████████▋ | 415/478 [1:51:04<15:42, 14.95s/it] {'loss': '0.1398', 'grad_norm': '5', 'learning_rate': '1.069e-06', 'ppl': '1.15', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21', 'tokens/total': 13280000, 'tokens/trainable': 257190, 'epoch': '1.729'}
87%|████████▋ | 415/478 [1:51:04<15:42, 14.95s/it] 87%|████████▋ | 416/478 [1:51:19<15:26, 14.95s/it] {'loss': '0.1499', 'grad_norm': '5.219', 'learning_rate': '1.036e-06', 'ppl': '1.162', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '24.32', 'tokens/total': 13312000, 'tokens/trainable': 257915, 'epoch': '1.733'}
87%|████████▋ | 416/478 [1:51:19<15:26, 14.95s/it] 87%|████████▋ | 417/478 [1:51:34<15:11, 14.95s/it] {'loss': '0.1335', 'grad_norm': '5.312', 'learning_rate': '1.004e-06', 'ppl': '1.143', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20.39', 'tokens/total': 13344000, 'tokens/trainable': 258524, 'epoch': '1.738'}
87%|████████▋ | 417/478 [1:51:34<15:11, 14.95s/it] 87%|████████▋ | 418/478 [1:51:49<14:56, 14.95s/it] {'loss': '0.1001', 'grad_norm': '3.969', 'learning_rate': '9.723e-07', 'ppl': '1.105', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.87', 'tokens/total': 13376000, 'tokens/trainable': 259207, 'epoch': '1.742'}
87%|████████▋ | 418/478 [1:51:49<14:56, 14.95s/it] 88%|████████▊ | 419/478 [1:52:04<14:41, 14.95s/it] {'loss': '0.09399', 'grad_norm': '4.562', 'learning_rate': '9.412e-07', 'ppl': '1.099', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '17.58', 'tokens/total': 13408000, 'tokens/trainable': 259732, 'epoch': '1.746'}
88%|████████▊ | 419/478 [1:52:04<14:41, 14.95s/it] 88%|████████▊ | 420/478 [1:52:19<14:26, 14.95s/it] {'loss': '0.08618', 'grad_norm': '4.438', 'learning_rate': '9.106e-07', 'ppl': '1.09', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '19.46', 'tokens/total': 13440000, 'tokens/trainable': 260313, 'epoch': '1.75'}
88%|████████▊ | 420/478 [1:52:19<14:26, 14.95s/it] 88%|████████▊ | 421/478 [1:52:34<14:12, 14.95s/it] {'loss': '0.1147', 'grad_norm': '4.719', 'learning_rate': '8.804e-07', 'ppl': '1.122', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20.85', 'tokens/total': 13472000, 'tokens/trainable': 260936, 'epoch': '1.754'}
88%|████████▊ | 421/478 [1:52:34<14:12, 14.95s/it] 88%|████████▊ | 422/478 [1:52:49<13:57, 14.95s/it] {'loss': '0.08655', 'grad_norm': '4.156', 'learning_rate': '8.508e-07', 'ppl': '1.09', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '18.58', 'tokens/total': 13504000, 'tokens/trainable': 261491, 'epoch': '1.758'}
88%|████████▊ | 422/478 [1:52:49<13:57, 14.95s/it] 88%|████████▊ | 423/478 [1:53:04<13:42, 14.95s/it] {'loss': '0.07715', 'grad_norm': '3.297', 'learning_rate': '8.216e-07', 'ppl': '1.08', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '19.03', 'tokens/total': 13536000, 'tokens/trainable': 262059, 'epoch': '1.762'}
88%|████████▊ | 423/478 [1:53:04<13:42, 14.95s/it] 89%|████████▊ | 424/478 [1:53:19<13:27, 14.95s/it] {'loss': '0.1002', 'grad_norm': '4.531', 'learning_rate': '7.929e-07', 'ppl': '1.105', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.1', 'tokens/total': 13568000, 'tokens/trainable': 262689, 'epoch': '1.767'}
89%|████████▊ | 424/478 [1:53:19<13:27, 14.95s/it] 89%|████████▉ | 425/478 [1:53:34<13:12, 14.95s/it] {'loss': '0.08423', 'grad_norm': '4.406', 'learning_rate': '7.647e-07', 'ppl': '1.088', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '17.76', 'tokens/total': 13600000, 'tokens/trainable': 263219, 'epoch': '1.771'}
89%|████████▉ | 425/478 [1:53:34<13:12, 14.95s/it] 89%|████████▉ | 426/478 [1:53:49<12:57, 14.95s/it] {'loss': '0.1345', 'grad_norm': '6.062', 'learning_rate': '7.37e-07', 'ppl': '1.144', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.27', 'tokens/total': 13632000, 'tokens/trainable': 263884, 'epoch': '1.775'}
89%|████████▉ | 426/478 [1:53:49<12:57, 14.95s/it] 89%|████████▉ | 427/478 [1:54:04<12:42, 14.95s/it] {'loss': '0.1025', 'grad_norm': '4.938', 'learning_rate': '7.098e-07', 'ppl': '1.108', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20.23', 'tokens/total': 13664000, 'tokens/trainable': 264488, 'epoch': '1.779'}
89%|████████▉ | 427/478 [1:54:04<12:42, 14.95s/it] 90%|████████▉ | 428/478 [1:54:18<12:27, 14.95s/it] {'loss': '0.07617', 'grad_norm': '3.781', 'learning_rate': '6.83e-07', 'ppl': '1.079', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.44', 'tokens/total': 13696000, 'tokens/trainable': 265128, 'epoch': '1.783'}
90%|████████▉ | 428/478 [1:54:18<12:27, 14.95s/it] 90%|████████▉ | 429/478 [1:54:33<12:12, 14.95s/it] {'loss': '0.1064', 'grad_norm': '4.469', 'learning_rate': '6.568e-07', 'ppl': '1.112', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21', 'tokens/total': 13728000, 'tokens/trainable': 265755, 'epoch': '1.788'}
90%|████████▉ | 429/478 [1:54:33<12:12, 14.95s/it] 90%|████████▉ | 430/478 [1:54:48<11:57, 14.95s/it] {'loss': '0.09375', 'grad_norm': '4.375', 'learning_rate': '6.311e-07', 'ppl': '1.098', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.77', 'tokens/total': 13760000, 'tokens/trainable': 266405, 'epoch': '1.792'}
90%|████████▉ | 430/478 [1:54:48<11:57, 14.95s/it] 90%|█████████ | 431/478 [1:55:03<11:42, 14.95s/it] {'loss': '0.1077', 'grad_norm': '4.219', 'learning_rate': '6.058e-07', 'ppl': '1.114', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '23.04', 'tokens/total': 13792000, 'tokens/trainable': 267093, 'epoch': '1.796'}
90%|█████████ | 431/478 [1:55:03<11:42, 14.95s/it] 90%|█████████ | 432/478 [1:55:18<11:27, 14.95s/it] {'loss': '0.1057', 'grad_norm': '4.594', 'learning_rate': '5.811e-07', 'ppl': '1.111', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.54', 'tokens/total': 13824000, 'tokens/trainable': 267736, 'epoch': '1.8'}
90%|█████████ | 432/478 [1:55:18<11:27, 14.95s/it] 91%|█████████ | 433/478 [1:55:33<11:12, 14.95s/it] {'loss': '0.1113', 'grad_norm': '5.094', 'learning_rate': '5.569e-07', 'ppl': '1.118', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.17', 'tokens/total': 13856000, 'tokens/trainable': 268398, 'epoch': '1.804'}
91%|█████████ | 433/478 [1:55:33<11:12, 14.95s/it] 91%|█████████ | 434/478 [1:55:48<10:56, 14.93s/it] {'loss': '0.1201', 'grad_norm': '4.969', 'learning_rate': '5.331e-07', 'ppl': '1.128', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.31', 'tokens/total': 13888000, 'tokens/trainable': 269061, 'epoch': '1.808'}
91%|█████████ | 434/478 [1:55:48<10:56, 14.93s/it] 91%|█████████ | 435/478 [1:56:03<10:42, 14.93s/it] {'loss': '0.104', 'grad_norm': '4.406', 'learning_rate': '5.099e-07', 'ppl': '1.11', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.69', 'tokens/total': 13920000, 'tokens/trainable': 269738, 'epoch': '1.812'}
91%|█████████ | 435/478 [1:56:03<10:42, 14.93s/it] 91%|█████████ | 436/478 [1:56:18<10:27, 14.93s/it] {'loss': '0.1233', 'grad_norm': '4.281', 'learning_rate': '4.872e-07', 'ppl': '1.131', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.51', 'tokens/total': 13952000, 'tokens/trainable': 270409, 'epoch': '1.817'}
91%|█████████ | 436/478 [1:56:18<10:27, 14.93s/it] 91%|█████████▏| 437/478 [1:56:33<10:12, 14.93s/it] {'loss': '0.1187', 'grad_norm': '5.219', 'learning_rate': '4.65e-07', 'ppl': '1.126', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '19.09', 'tokens/total': 13984000, 'tokens/trainable': 270979, 'epoch': '1.821'}
91%|█████████▏| 437/478 [1:56:33<10:12, 14.93s/it] 92%|█████████▏| 438/478 [1:56:48<09:57, 14.94s/it] {'loss': '0.1045', 'grad_norm': '5', 'learning_rate': '4.432e-07', 'ppl': '1.11', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.7', 'tokens/total': 14016000, 'tokens/trainable': 271627, 'epoch': '1.825'}
92%|█████████▏| 438/478 [1:56:48<09:57, 14.94s/it] 92%|█████████▏| 439/478 [1:57:03<09:44, 15.00s/it] {'loss': '0.1008', 'grad_norm': '5', 'learning_rate': '4.22e-07', 'ppl': '1.106', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '19.12', 'tokens/total': 14048000, 'tokens/trainable': 272205, 'epoch': '1.829'}
92%|█████████▏| 439/478 [1:57:03<09:44, 15.00s/it] 92%|█████████▏| 440/478 [1:57:18<09:29, 14.98s/it] {'loss': '0.1465', 'grad_norm': '5.156', 'learning_rate': '4.013e-07', 'ppl': '1.158', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20.87', 'tokens/total': 14080000, 'tokens/trainable': 272828, 'epoch': '1.833'}
92%|█████████▏| 440/478 [1:57:18<09:29, 14.98s/it] 92%|█████████▏| 441/478 [1:57:33<09:13, 14.97s/it] {'loss': '0.1189', 'grad_norm': '4.219', 'learning_rate': '3.812e-07', 'ppl': '1.126', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20.87', 'tokens/total': 14112000, 'tokens/trainable': 273451, 'epoch': '1.837'}
92%|█████████▏| 441/478 [1:57:33<09:13, 14.97s/it] 92%|█████████▏| 442/478 [1:57:48<08:58, 14.97s/it] {'loss': '0.09351', 'grad_norm': '4.188', 'learning_rate': '3.615e-07', 'ppl': '1.098', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '18.96', 'tokens/total': 14144000, 'tokens/trainable': 274017, 'epoch': '1.842'}
92%|█████████▏| 442/478 [1:57:48<08:58, 14.97s/it] 93%|█████████▎| 443/478 [1:58:03<08:43, 14.96s/it] {'loss': '0.1255', 'grad_norm': '4.656', 'learning_rate': '3.423e-07', 'ppl': '1.134', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.1', 'tokens/total': 14176000, 'tokens/trainable': 274647, 'epoch': '1.846'}
93%|█████████▎| 443/478 [1:58:03<08:43, 14.96s/it] 93%|█████████▎| 444/478 [1:58:18<08:28, 14.96s/it] {'loss': '0.1572', 'grad_norm': '5.25', 'learning_rate': '3.237e-07', 'ppl': '1.17', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '23.21', 'tokens/total': 14208000, 'tokens/trainable': 275340, 'epoch': '1.85'}
93%|█████████▎| 444/478 [1:58:18<08:28, 14.96s/it] 93%|█████████▎| 445/478 [1:58:33<08:13, 14.95s/it] {'loss': '0.0918', 'grad_norm': '3.75', 'learning_rate': '3.055e-07', 'ppl': '1.096', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.47', 'tokens/total': 14240000, 'tokens/trainable': 275980, 'epoch': '1.854'}
93%|█████████▎| 445/478 [1:58:33<08:13, 14.95s/it] 93%|█████████▎| 446/478 [1:58:48<07:58, 14.95s/it] {'loss': '0.07227', 'grad_norm': '3.5', 'learning_rate': '2.879e-07', 'ppl': '1.075', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.79', 'tokens/total': 14272000, 'tokens/trainable': 276631, 'epoch': '1.858'}
93%|█████████▎| 446/478 [1:58:48<07:58, 14.95s/it] 94%|█████████▎| 447/478 [1:59:03<07:43, 14.95s/it] {'loss': '0.137', 'grad_norm': '4.969', 'learning_rate': '2.708e-07', 'ppl': '1.147', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '23.74', 'tokens/total': 14304000, 'tokens/trainable': 277340, 'epoch': '1.863'}
94%|█████████▎| 447/478 [1:59:03<07:43, 14.95s/it] 94%|█████████▎| 448/478 [1:59:17<07:28, 14.95s/it] {'loss': '0.1152', 'grad_norm': '4.75', 'learning_rate': '2.542e-07', 'ppl': '1.122', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '19.96', 'tokens/total': 14336000, 'tokens/trainable': 277936, 'epoch': '1.867'}
94%|█████████▎| 448/478 [1:59:17<07:28, 14.95s/it] 94%|█████████▍| 449/478 [1:59:32<07:12, 14.93s/it] {'loss': '0.167', 'grad_norm': '5.812', 'learning_rate': '2.381e-07', 'ppl': '1.182', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '23.15', 'tokens/total': 14368000, 'tokens/trainable': 278624, 'epoch': '1.871'}
94%|█████████▍| 449/478 [1:59:32<07:12, 14.93s/it] 94%|█████████▍| 450/478 [1:59:47<06:58, 14.93s/it] {'loss': '0.09229', 'grad_norm': '4.312', 'learning_rate': '2.226e-07', 'ppl': '1.097', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.07', 'tokens/total': 14400000, 'tokens/trainable': 279253, 'epoch': '1.875'}
94%|█████████▍| 450/478 [1:59:47<06:58, 14.93s/it] 94%|█████████▍| 451/478 [2:00:02<06:43, 14.93s/it] {'loss': '0.0874', 'grad_norm': '3.5', 'learning_rate': '2.076e-07', 'ppl': '1.091', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.39', 'tokens/total': 14432000, 'tokens/trainable': 279921, 'epoch': '1.879'}
94%|█████████▍| 451/478 [2:00:02<06:43, 14.93s/it] 95%|█████████▍| 452/478 [2:00:17<06:27, 14.92s/it] {'loss': '0.1311', 'grad_norm': '4.469', 'learning_rate': '1.93e-07', 'ppl': '1.14', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.63', 'tokens/total': 14464000, 'tokens/trainable': 280564, 'epoch': '1.883'}
95%|█████████▍| 452/478 [2:00:17<06:27, 14.92s/it] 95%|█████████▍| 453/478 [2:00:32<06:13, 14.93s/it] {'loss': '0.1213', 'grad_norm': '4.562', 'learning_rate': '1.79e-07', 'ppl': '1.129', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.21', 'tokens/total': 14496000, 'tokens/trainable': 281227, 'epoch': '1.887'}
95%|█████████▍| 453/478 [2:00:32<06:13, 14.93s/it] 95%|█████████▍| 454/478 [2:00:47<05:58, 14.94s/it] {'loss': '0.1208', 'grad_norm': '5.375', 'learning_rate': '1.656e-07', 'ppl': '1.128', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '16.85', 'tokens/total': 14528000, 'tokens/trainable': 281731, 'epoch': '1.892'}
95%|█████████▍| 454/478 [2:00:47<05:58, 14.94s/it] 95%|█████████▌| 455/478 [2:01:02<05:43, 14.94s/it] {'loss': '0.1101', 'grad_norm': '4.938', 'learning_rate': '1.526e-07', 'ppl': '1.116', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20.3', 'tokens/total': 14560000, 'tokens/trainable': 282337, 'epoch': '1.896'}
95%|█████████▌| 455/478 [2:01:02<05:43, 14.94s/it] 95%|█████████▌| 456/478 [2:01:17<05:28, 14.94s/it] {'loss': '0.1323', 'grad_norm': '4.969', 'learning_rate': '1.402e-07', 'ppl': '1.141', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.47', 'tokens/total': 14592000, 'tokens/trainable': 282978, 'epoch': '1.9'}
95%|█████████▌| 456/478 [2:01:17<05:28, 14.94s/it] 96%|█████████▌| 457/478 [2:01:32<05:13, 14.94s/it] {'loss': '0.1028', 'grad_norm': '4.594', 'learning_rate': '1.283e-07', 'ppl': '1.108', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.3', 'tokens/total': 14624000, 'tokens/trainable': 283614, 'epoch': '1.904'}
96%|█████████▌| 457/478 [2:01:32<05:13, 14.94s/it] 96%|█████████▌| 458/478 [2:01:47<04:58, 14.94s/it] {'loss': '0.1025', 'grad_norm': '4.156', 'learning_rate': '1.169e-07', 'ppl': '1.108', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.45', 'tokens/total': 14656000, 'tokens/trainable': 284283, 'epoch': '1.908'}
96%|█████████▌| 458/478 [2:01:47<04:58, 14.94s/it] 96%|█████████▌| 459/478 [2:02:02<04:43, 14.94s/it] {'loss': '0.0918', 'grad_norm': '4.281', 'learning_rate': '1.061e-07', 'ppl': '1.096', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.04', 'tokens/total': 14688000, 'tokens/trainable': 284941, 'epoch': '1.913'}
96%|█████████▌| 459/478 [2:02:02<04:43, 14.94s/it] 96%|█████████▌| 460/478 [2:02:17<04:28, 14.94s/it] {'loss': '0.08789', 'grad_norm': '4.656', 'learning_rate': '9.575e-08', 'ppl': '1.092', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.68', 'tokens/total': 14720000, 'tokens/trainable': 285618, 'epoch': '1.917'}
96%|█████████▌| 460/478 [2:02:17<04:28, 14.94s/it] 96%|█████████▋| 461/478 [2:02:32<04:13, 14.93s/it] {'loss': '0.1421', 'grad_norm': '5.469', 'learning_rate': '8.595e-08', 'ppl': '1.153', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '23.9', 'tokens/total': 14752000, 'tokens/trainable': 286330, 'epoch': '1.921'}
96%|█████████▋| 461/478 [2:02:32<04:13, 14.93s/it] 97%|█████████▋| 462/478 [2:02:47<03:58, 14.94s/it] {'loss': '0.1292', 'grad_norm': '4.75', 'learning_rate': '7.668e-08', 'ppl': '1.138', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20.47', 'tokens/total': 14784000, 'tokens/trainable': 286941, 'epoch': '1.925'}
97%|█████████▋| 462/478 [2:02:47<03:58, 14.94s/it] 97%|█████████▋| 463/478 [2:03:02<03:46, 15.13s/it] {'loss': '0.1533', 'grad_norm': '5.406', 'learning_rate': '6.793e-08', 'ppl': '1.166', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.2', 'tokens/total': 14816000, 'tokens/trainable': 287574, 'epoch': '1.929'}
97%|█████████▋| 463/478 [2:03:02<03:46, 15.13s/it] 97%|█████████▋| 464/478 [2:03:17<03:31, 15.08s/it] {'loss': '0.09521', 'grad_norm': '3.891', 'learning_rate': '5.971e-08', 'ppl': '1.1', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20.9', 'tokens/total': 14848000, 'tokens/trainable': 288198, 'epoch': '1.933'}
97%|█████████▋| 464/478 [2:03:17<03:31, 15.08s/it] 97%|█████████▋| 465/478 [2:03:32<03:15, 15.04s/it] {'loss': '0.1111', 'grad_norm': '4.344', 'learning_rate': '5.202e-08', 'ppl': '1.117', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.87', 'tokens/total': 14880000, 'tokens/trainable': 288851, 'epoch': '1.938'}
97%|█████████▋| 465/478 [2:03:32<03:15, 15.04s/it] 97%|█████████▋| 466/478 [2:03:47<03:00, 15.07s/it] {'loss': '0.1223', 'grad_norm': '5.156', 'learning_rate': '4.486e-08', 'ppl': '1.13', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '19.42', 'tokens/total': 14912000, 'tokens/trainable': 289439, 'epoch': '1.942'}
97%|█████████▋| 466/478 [2:03:47<03:00, 15.07s/it] 98%|█████████▊| 467/478 [2:04:02<02:45, 15.04s/it] {'loss': '0.1265', 'grad_norm': '4.75', 'learning_rate': '3.823e-08', 'ppl': '1.135', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.47', 'tokens/total': 14944000, 'tokens/trainable': 290110, 'epoch': '1.946'}
98%|█████████▊| 467/478 [2:04:02<02:45, 15.04s/it] 98%|█████████▊| 468/478 [2:04:17<02:30, 15.01s/it] {'loss': '0.1294', 'grad_norm': '5.344', 'learning_rate': '3.213e-08', 'ppl': '1.138', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.03', 'tokens/total': 14976000, 'tokens/trainable': 290738, 'epoch': '1.95'}
98%|█████████▊| 468/478 [2:04:17<02:30, 15.01s/it] 98%|█████████▊| 469/478 [2:04:32<02:14, 14.99s/it] {'loss': '0.1489', 'grad_norm': '5.562', 'learning_rate': '2.655e-08', 'ppl': '1.161', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '17.81', 'tokens/total': 15008000, 'tokens/trainable': 291270, 'epoch': '1.954'}
98%|█████████▊| 469/478 [2:04:32<02:14, 14.99s/it] 98%|█████████▊| 470/478 [2:04:47<01:59, 14.98s/it] {'loss': '0.1387', 'grad_norm': '5.312', 'learning_rate': '2.151e-08', 'ppl': '1.149', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20.5', 'tokens/total': 15040000, 'tokens/trainable': 291882, 'epoch': '1.958'}
98%|█████████▊| 470/478 [2:04:47<01:59, 14.98s/it] 99%|█████████▊| 471/478 [2:05:02<01:44, 14.97s/it] {'loss': '0.1108', 'grad_norm': '4.594', 'learning_rate': '1.7e-08', 'ppl': '1.117', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.23', 'tokens/total': 15072000, 'tokens/trainable': 292516, 'epoch': '1.962'}
99%|█████████▊| 471/478 [2:05:02<01:44, 14.97s/it] 99%|█████████▊| 472/478 [2:05:17<01:29, 14.96s/it] {'loss': '0.1416', 'grad_norm': '5.562', 'learning_rate': '1.301e-08', 'ppl': '1.152', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '17.82', 'tokens/total': 15104000, 'tokens/trainable': 293048, 'epoch': '1.967'}
99%|█████████▊| 472/478 [2:05:17<01:29, 14.96s/it] 99%|█████████▉| 473/478 [2:05:32<01:14, 14.96s/it] {'loss': '0.1084', 'grad_norm': '4.75', 'learning_rate': '9.562e-09', 'ppl': '1.114', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '19.33', 'tokens/total': 15136000, 'tokens/trainable': 293625, 'epoch': '1.971'}
99%|█████████▉| 473/478 [2:05:32<01:14, 14.96s/it] 99%|█████████▉| 474/478 [2:05:47<00:59, 14.95s/it] {'loss': '0.1045', 'grad_norm': '4.344', 'learning_rate': '6.641e-09', 'ppl': '1.11', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '23.11', 'tokens/total': 15168000, 'tokens/trainable': 294314, 'epoch': '1.975'}
99%|█████████▉| 474/478 [2:05:47<00:59, 14.95s/it] 99%|█████████▉| 475/478 [2:06:02<00:44, 14.95s/it] {'loss': '0.1055', 'grad_norm': '4.812', 'learning_rate': '4.25e-09', 'ppl': '1.111', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '20.31', 'tokens/total': 15200000, 'tokens/trainable': 294920, 'epoch': '1.979'}
99%|█████████▉| 475/478 [2:06:02<00:44, 14.95s/it] 100%|█████████▉| 476/478 [2:06:17<00:29, 14.95s/it] {'loss': '0.1462', 'grad_norm': '5.719', 'learning_rate': '2.391e-09', 'ppl': '1.157', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '22.54', 'tokens/total': 15232000, 'tokens/trainable': 295593, 'epoch': '1.983'}
100%|█████████▉| 476/478 [2:06:17<00:29, 14.95s/it] 100%|█████████▉| 477/478 [2:06:32<00:14, 14.95s/it] {'loss': '0.1807', 'grad_norm': '6.406', 'learning_rate': '1.063e-09', 'ppl': '1.198', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '21.9', 'tokens/total': 15264000, 'tokens/trainable': 296247, 'epoch': '1.988'}
100%|█████████▉| 477/478 [2:06:32<00:14, 14.95s/it] 100%|██████████| 478/478 [2:06:47<00:00, 14.95s/it] {'loss': '0.1331', 'grad_norm': '5.906', 'learning_rate': '2.657e-10', 'ppl': '1.142', 'memory/max_active (GiB)': '45.41', 'memory/max_allocated (GiB)': '45.41', 'memory/device_reserved (GiB)': '57.25', 'tokens/train_per_sec_per_gpu': '19.63', 'tokens/total': 15296000, 'tokens/trainable': 296833, 'epoch': '1.992'}
100%|██████████| 478/478 [2:06:47<00:00, 14.95s/it][2026-04-17 04:16:11,186] [INFO] [axolotl.core.trainers.base] Running evaluation step...
[2026-04-17 04:16:19,065] [INFO] [axolotl.utils.samplers.multipack] gather_len_batches: [54, 54]
0%| | 0/27 [00:00<?, ?it/s]
7%|▋ | 2/27 [00:02<00:35, 1.41s/it]
11%|█ | 3/27 [00:05<00:47, 1.96s/it]
15%|█▍ | 4/27 [00:08<00:51, 2.26s/it]
19%|█▊ | 5/27 [00:11<00:53, 2.43s/it]
22%|██▏ | 6/27 [00:13<00:53, 2.54s/it]
26%|██▌ | 7/27 [00:16<00:52, 2.61s/it]
30%|██▉ | 8/27 [00:19<00:50, 2.65s/it]
33%|███▎ | 9/27 [00:22<00:48, 2.68s/it]
37%|███▋ | 10/27 [00:24<00:45, 2.70s/it]
41%|████ | 11/27 [00:27<00:43, 2.72s/it]
44%|████▍ | 12/27 [00:30<00:39, 2.64s/it]
48%|████▊ | 13/27 [00:33<00:38, 2.76s/it]
52%|█████▏ | 14/27 [00:35<00:35, 2.76s/it]
56%|█████▌ | 15/27 [00:38<00:33, 2.76s/it]
59%|█████▉ | 16/27 [00:41<00:30, 2.76s/it]
63%|██████▎ | 17/27 [00:44<00:27, 2.75s/it]
67%|██████▋ | 18/27 [00:46<00:24, 2.75s/it]
70%|███████ | 19/27 [00:49<00:22, 2.75s/it]
74%|███████▍ | 20/27 [00:52<00:19, 2.75s/it]
78%|███████▊ | 21/27 [00:54<00:16, 2.67s/it]
81%|████████▏ | 22/27 [00:57<00:13, 2.78s/it]
85%|████████▌ | 23/27 [01:00<00:11, 2.77s/it]
89%|████████▉ | 24/27 [01:03<00:08, 2.77s/it]
93%|█████████▎| 25/27 [01:06<00:05, 2.76s/it]
96%|█████████▋| 26/27 [01:08<00:02, 2.76s/it]
100%|██████████| 27/27 [01:11<00:00, 2.79s/it]
{'eval_loss': '0.2227', 'eval_runtime': '75.1', 'eval_samples_per_second': '2.783', 'eval_steps_per_second': '1.398', 'eval_ppl': '1.249', 'memory/max_active (GiB)': '34.91', 'memory/max_allocated (GiB)': '34.91', 'memory/device_reserved (GiB)': '57.25', 'epoch': '1.992', 'tokens/train_per_sec_per_gpu': '0'}
100%|██████████| 478/478 [2:08:10<00:00, 14.95s/it]
100%|██████████| 27/27 [01:13<00:00, 2.79s/it]
[2026-04-17 04:17:40,668] [INFO] [axolotl.core.trainers.base] Saving model checkpoint to /workspace/data/outputs/qwen3-4B/fft_magnifi-module-classifier-04-17-relabelled-upsampled/checkpoint-478
Writing model shards: 0%| | 0/1 [00:00<?, ?it/s]
Writing model shards: 100%|██████████| 1/1 [00:20<00:00, 20.91s/it] Writing model shards: 100%|██████████| 1/1 [00:20<00:00, 20.91s/it]
{'train_runtime': '7816', 'train_samples_per_second': '0.122', 'train_steps_per_second': '0.061', 'train_loss': '0.1648', 'memory/max_active (GiB)': '12.01', 'memory/max_allocated (GiB)': '12.01', 'memory/device_reserved (GiB)': '12.11', 'epoch': '1.992', 'tokens/train_per_sec_per_gpu': '0'}
100%|██████████| 478/478 [2:10:13<00:00, 14.95s/it] 100%|██████████| 478/478 [2:10:13<00:00, 16.35s/it]
[2026-04-17 04:20:34,131] [INFO] [axolotl.train] Training completed! Saving trained model to /workspace/data/outputs/qwen3-4B/fft_magnifi-module-classifier-04-17-relabelled-upsampled/.
[2026-04-17 04:20:39,898] [INFO] [axolotl.core.trainers.base] Saving model checkpoint to /workspace/data/outputs/qwen3-4B/fft_magnifi-module-classifier-04-17-relabelled-upsampled/
Writing model shards: 0%| | 0/1 [00:00<?, ?it/s] Writing model shards: 100%|██████████| 1/1 [00:22<00:00, 22.07s/it] Writing model shards: 100%|██████████| 1/1 [00:22<00:00, 22.07s/it]
[2026-04-17 04:21:08,433] [INFO] [axolotl.core.trainers.base] Saving model checkpoint to /workspace/data/outputs/qwen3-4B/fft_magnifi-module-classifier-04-17-relabelled-upsampled/
Writing model shards: 0%| | 0/1 [00:00<?, ?it/s] Writing model shards: 100%|██████████| 1/1 [00:20<00:00, 20.34s/it] Writing model shards: 100%|██████████| 1/1 [00:20<00:00, 20.35s/it]
Processing Files (0 / 0) : | | 0.00B / 0.00B
New Data Upload : | | 0.00B / 0.00B
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 2%|▏ | 184MB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 2%|▏ | 184MB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 2%|▏ | 195MB / 8.83GB, ???B/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 4%|▍ | 336MB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 4%|▍ | 347MB / 8.83GB, 758MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 5%|▌ | 464MB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 5%|▌ | 475MB / 8.83GB, 701MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 7%|▋ | 576MB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 7%|▋ | 587MB / 8.83GB, 653MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 8%|▊ | 680MB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 8%|▊ | 691MB / 8.83GB, 620MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 10%|█ | 912MB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 11%|█ | 976MB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 13%|█▎ | 1.12GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 13%|█▎ | 1.13GB / 8.83GB, 668MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 14%|█▍ | 1.22GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 14%|█▍ | 1.23GB / 8.83GB, 645MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 15%|█▍ | 1.30GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 15%|█▍ | 1.32GB / 8.83GB, 622MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 16%|█▌ | 1.40GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 16%|█▌ | 1.41GB / 8.83GB, 608MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 17%|█▋ | 1.51GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 17%|█▋ | 1.52GB / 8.83GB, 604MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 18%|█▊ | 1.61GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 18%|█▊ | 1.62GB / 8.83GB, 593MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 19%|█▉ | 1.70GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 19%|█▉ | 1.72GB / 8.83GB, 585MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 21%|██ | 1.82GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 21%|██ | 1.83GB / 8.83GB, 583MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 22%|██▏ | 1.92GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 22%|██▏ | 1.93GB / 8.83GB, 579MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 23%|██▎ | 2.02GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 23%|██▎ | 2.04GB / 8.83GB, 575MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 24%|██▍ | 2.11GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 24%|██▍ | 2.12GB / 8.83GB, 567MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 25%|██▌ | 2.21GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 25%|██▌ | 2.22GB / 8.83GB, 562MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 26%|██▌ | 2.30GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 26%|██▌ | 2.32GB / 8.83GB, 558MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 27%|██▋ | 2.41GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 27%|██▋ | 2.42GB / 8.83GB, 556MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 28%|██▊ | 2.50GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 28%|██▊ | 2.52GB / 8.83GB, 552MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 30%|██▉ | 2.62GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 30%|██▉ | 2.63GB / 8.83GB, 553MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 31%|███ | 2.71GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 31%|███ | 2.72GB / 8.83GB, 550MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 32%|███▏ | 2.82GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 32%|███▏ | 2.83GB / 8.83GB, 548MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 33%|███▎ | 2.91GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 33%|███▎ | 2.92GB / 8.83GB, 546MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 34%|███▍ | 3.01GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 34%|███▍ | 3.02GB / 8.83GB, 543MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 35%|███▌ | 3.11GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 35%|███▌ | 3.12GB / 8.83GB, 542MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 37%|███▋ | 3.22GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 37%|███▋ | 3.24GB / 8.83GB, 543MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 38%|███▊ | 3.33GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 38%|███▊ | 3.34GB / 8.83GB, 542MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 39%|███▉ | 3.43GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 39%|███▉ | 3.44GB / 8.83GB, 541MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 40%|████ | 3.54GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 40%|████ | 3.56GB / 8.83GB, 542MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 41%|████▏ | 3.65GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 41%|████▏ | 3.66GB / 8.83GB, 541MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 43%|████▎ | 3.76GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 43%|████▎ | 3.77GB / 8.83GB, 542MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 44%|████▍ | 3.86GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 44%|████▍ | 3.88GB / 8.83GB, 541MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 45%|████▌ | 3.98GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 45%|████▌ | 4.00GB / 8.83GB, 543MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 47%|████▋ | 4.10GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 47%|████▋ | 4.12GB / 8.83GB, 544MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 48%|████▊ | 4.22GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 48%|████▊ | 4.23GB / 8.83GB, 545MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 49%|████▉ | 4.34GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 49%|████▉ | 4.36GB / 8.83GB, 547MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 51%|█████ | 4.46GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 51%|█████ | 4.48GB / 8.83GB, 549MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 52%|█████▏ | 4.58GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 52%|█████▏ | 4.59GB / 8.83GB, 549MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 53%|█████▎ | 4.70GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 53%|█████▎ | 4.71GB / 8.83GB, 550MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 54%|█████▍ | 4.81GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 55%|█████▍ | 4.82GB / 8.83GB, 550MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 56%|█████▌ | 4.93GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 56%|█████▌ | 4.94GB / 8.83GB, 552MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 57%|█████▋ | 5.05GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 57%|█████▋ | 5.06GB / 8.83GB, 553MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 58%|█████▊ | 5.16GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 59%|█████▊ | 5.17GB / 8.83GB, 553MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 60%|█████▉ | 5.28GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 60%|█████▉ | 5.29GB / 8.83GB, 554MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 61%|██████ | 5.39GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 61%|██████ | 5.40GB / 8.83GB, 554MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 62%|██████▏ | 5.51GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 63%|██████▎ | 5.52GB / 8.83GB, 555MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 64%|██████▎ | 5.62GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 64%|██████▍ | 5.64GB / 8.83GB, 555MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 65%|██████▌ | 5.74GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 65%|██████▌ | 5.76GB / 8.83GB, 556MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 66%|██████▋ | 5.86GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 67%|██████▋ | 5.88GB / 8.83GB, 557MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 68%|██████▊ | 5.98GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 68%|██████▊ | 5.99GB / 8.83GB, 553MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 69%|██████▉ | 6.10GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 69%|██████▉ | 6.11GB / 8.83GB, 552MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 70%|███████ | 6.22GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 70%|███████ | 6.23GB / 8.83GB, 553MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 72%|███████▏ | 6.34GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 72%|███████▏ | 6.35GB / 8.83GB, 554MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 73%|███████▎ | 6.46GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 73%|███████▎ | 6.47GB / 8.83GB, 550MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 75%|███████▍ | 6.58GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 75%|███████▍ | 6.59GB / 8.83GB, 549MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 76%|███████▌ | 6.70GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 76%|███████▌ | 6.71GB / 8.83GB, 547MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 77%|███████▋ | 6.81GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 77%|███████▋ | 6.82GB / 8.83GB, 548MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 79%|███████▊ | 6.93GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 79%|███████▊ | 6.94GB / 8.83GB, 551MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 80%|███████▉ | 7.05GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 80%|███████▉ | 7.06GB / 8.83GB, 554MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 81%|████████▏ | 7.18GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 81%|████████▏ | 7.19GB / 8.83GB, 555MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 83%|████████▎ | 7.29GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 83%|████████▎ | 7.30GB / 8.83GB, 557MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 84%|████████▍ | 7.42GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 84%|████████▍ | 7.43GB / 8.83GB, 560MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 85%|████████▌ | 7.53GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 85%|████████▌ | 7.54GB / 8.83GB, 560MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 87%|████████▋ | 7.65GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 87%|████████▋ | 7.66GB / 8.83GB, 562MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 88%|████████▊ | 7.77GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 88%|████████▊ | 7.78GB / 8.83GB, 563MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 89%|████████▉ | 7.89GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 89%|████████▉ | 7.90GB / 8.83GB, 566MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 91%|█████████ | 8.01GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 91%|█████████ | 8.02GB / 8.83GB, 569MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 92%|█████████▏| 8.13GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 92%|█████████▏| 8.14GB / 8.83GB, 571MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 93%|█████████▎| 8.24GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 93%|█████████▎| 8.25GB / 8.83GB, 572MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 95%|█████████▌| 8.38GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 95%|█████████▌| 8.40GB / 8.83GB, 576MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 97%|█████████▋| 8.52GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 97%|█████████▋| 8.53GB / 8.83GB, 579MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 98%|█████████▊| 8.63GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 98%|█████████▊| 8.64GB / 8.83GB, 580MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 99%|█████████▉| 8.75GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 99%|█████████▉| 8.76GB / 8.83GB, 582MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 100%|██████████| 8.82GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (3 / 3) : 100%|██████████| 8.83GB / 8.83GB, 580MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 100%|██████████| 8.82GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 100%|██████████| 8.82GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 100%|██████████| 8.82GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 100%|██████████| 8.82GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 100%|██████████| 8.82GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 100%|██████████| 8.82GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 100%|██████████| 8.82GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 100%|██████████| 8.82GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (3 / 3) : 100%|██████████| 8.83GB / 8.83GB, 506MB/s
New Data Upload : | | 0.00B / 0.00B, 0.00B/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 100%|██████████| 8.82GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB
[2026-04-17 04:21:54,742] [INFO] [axolotl.train] Model successfully saved to /workspace/data/outputs/qwen3-4B/fft_magnifi-module-classifier-04-17-relabelled-upsampled/
[2026-04-17 04:22:00,408] [INFO] [axolotl.core.trainers.base] Saving model checkpoint to /workspace/data/outputs/qwen3-4B/fft_magnifi-module-classifier-04-17-relabelled-upsampled/
Writing model shards: 0%| | 0/1 [00:00<?, ?it/s] Writing model shards: 100%|██████████| 1/1 [00:22<00:00, 22.09s/it] Writing model shards: 100%|██████████| 1/1 [00:22<00:00, 22.09s/it]
Processing Files (0 / 0) : | | 0.00B / 0.00B
New Data Upload : | | 0.00B / 0.00B
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 2%|▏ | 160MB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 2%|▏ | 160MB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 2%|▏ | 171MB / 8.83GB, ???B/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 3%|▎ | 248MB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 3%|▎ | 259MB / 8.83GB, 440MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 4%|▍ | 352MB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 4%|▍ | 363MB / 8.83GB, 480MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 5%|▌ | 464MB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 5%|▌ | 475MB / 8.83GB, 507MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 7%|▋ | 576MB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 7%|▋ | 587MB / 8.83GB, 520MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 8%|▊ | 680MB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 8%|▊ | 691MB / 8.83GB, 520MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 9%|▉ | 784MB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 9%|▉ | 795MB / 8.83GB, 520MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 10%|█ | 888MB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 10%|█ | 899MB / 8.83GB, 520MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 11%|█ | 984MB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 11%|█▏ | 995MB / 8.83GB, 515MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 12%|█▏ | 1.09GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 12%|█▏ | 1.10GB / 8.83GB, 515MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 13%|█▎ | 1.18GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 14%|█▎ | 1.20GB / 8.83GB, 512MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 15%|█▍ | 1.29GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 15%|█▍ | 1.30GB / 8.83GB, 513MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 16%|█▌ | 1.39GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 16%|█▌ | 1.40GB / 8.83GB, 513MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 17%|█▋ | 1.50GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 17%|█▋ | 1.51GB / 8.83GB, 514MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 18%|█▊ | 1.61GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 18%|█▊ | 1.62GB / 8.83GB, 517MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 19%|█▉ | 1.72GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 20%|█▉ | 1.73GB / 8.83GB, 520MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 21%|██ | 1.82GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 21%|██ | 1.83GB / 8.83GB, 517MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 22%|██▏ | 1.91GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 22%|██▏ | 1.92GB / 8.83GB, 515MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 23%|██▎ | 2.01GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 23%|██▎ | 2.02GB / 8.83GB, 513MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 24%|██▍ | 2.11GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 24%|██▍ | 2.12GB / 8.83GB, 514MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 25%|██▌ | 2.21GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 25%|██▌ | 2.22GB / 8.83GB, 512MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 26%|██▌ | 2.30GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 26%|██▌ | 2.32GB / 8.83GB, 511MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 27%|██▋ | 2.40GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 27%|██▋ | 2.41GB / 8.83GB, 509MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 28%|██▊ | 2.50GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 28%|██▊ | 2.52GB / 8.83GB, 510MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 29%|██▉ | 2.60GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 30%|██▉ | 2.61GB / 8.83GB, 508MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 31%|███ | 2.72GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 31%|███ | 2.73GB / 8.83GB, 512MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 32%|███▏ | 2.84GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 32%|███▏ | 2.85GB / 8.83GB, 515MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 33%|███▎ | 2.94GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 33%|███▎ | 2.96GB / 8.83GB, 516MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 35%|███▍ | 3.06GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 35%|███▍ | 3.07GB / 8.83GB, 517MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 36%|███▌ | 3.16GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 36%|███▌ | 3.17GB / 8.83GB, 517MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 37%|███▋ | 3.27GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 37%|███▋ | 3.28GB / 8.83GB, 519MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 38%|███▊ | 3.38GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 38%|███▊ | 3.39GB / 8.83GB, 519MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 40%|███▉ | 3.49GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 40%|███▉ | 3.50GB / 8.83GB, 520MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 41%|████ | 3.60GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 41%|████ | 3.61GB / 8.83GB, 521MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 42%|████▏ | 3.73GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 42%|████▏ | 3.74GB / 8.83GB, 525MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 44%|████▎ | 3.84GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 44%|████▎ | 3.85GB / 8.83GB, 526MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 45%|████▍ | 3.96GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 45%|████▍ | 3.97GB / 8.83GB, 528MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 46%|████▌ | 4.07GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 46%|████▌ | 4.08GB / 8.83GB, 529MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 48%|████▊ | 4.19GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 48%|████▊ | 4.20GB / 8.83GB, 531MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 49%|████▉ | 4.34GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 49%|████▉ | 4.36GB / 8.83GB, 536MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 51%|█████ | 4.46GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 51%|█████ | 4.47GB / 8.83GB, 537MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 52%|█████▏ | 4.58GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 52%|█████▏ | 4.59GB / 8.83GB, 539MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 53%|█████▎ | 4.69GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 53%|█████▎ | 4.70GB / 8.83GB, 539MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 55%|█████▍ | 4.82GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 55%|█████▍ | 4.83GB / 8.83GB, 541MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 56%|█████▌ | 4.94GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 56%|█████▌ | 4.95GB / 8.83GB, 543MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 57%|█████▋ | 5.06GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 57%|█████▋ | 5.07GB / 8.83GB, 544MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 59%|█████▊ | 5.17GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 59%|█████▊ | 5.18GB / 8.83GB, 544MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 60%|█████▉ | 5.29GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 60%|█████▉ | 5.30GB / 8.83GB, 546MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 61%|██████ | 5.40GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 61%|██████▏ | 5.41GB / 8.83GB, 546MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 63%|██████▎ | 5.54GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 63%|██████▎ | 5.55GB / 8.83GB, 549MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 64%|██████▍ | 5.66GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 64%|██████▍ | 5.68GB / 8.83GB, 550MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 66%|██████▌ | 5.78GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 66%|██████▌ | 5.80GB / 8.83GB, 551MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 67%|██████▋ | 5.90GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 67%|██████▋ | 5.92GB / 8.83GB, 555MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 68%|██████▊ | 6.02GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 68%|██████▊ | 6.03GB / 8.83GB, 555MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 70%|██████▉ | 6.14GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 70%|██████▉ | 6.15GB / 8.83GB, 556MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 71%|███████ | 6.26GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 71%|███████ | 6.27GB / 8.83GB, 557MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 72%|███████▏ | 6.38GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 72%|███████▏ | 6.39GB / 8.83GB, 558MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 74%|███████▎ | 6.49GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 74%|███████▎ | 6.50GB / 8.83GB, 559MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 75%|███████▍ | 6.61GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 75%|███████▍ | 6.62GB / 8.83GB, 561MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 76%|███████▌ | 6.72GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 76%|███████▌ | 6.73GB / 8.83GB, 562MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 78%|███████▊ | 6.84GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 78%|███████▊ | 6.85GB / 8.83GB, 564MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 79%|███████▉ | 6.95GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 79%|███████▉ | 6.96GB / 8.83GB, 566MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 80%|████████ | 7.06GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 80%|████████ | 7.08GB / 8.83GB, 566MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 82%|████████▏ | 7.19GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 82%|████████▏ | 7.20GB / 8.83GB, 569MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 90%|█████████ | 7.98GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 91%|█████████ | 7.99GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 91%|█████████ | 7.99GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 91%|█████████ | 8.00GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 92%|█████████▏| 8.12GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 92%|█████████▏| 8.13GB / 8.83GB, 609MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 94%|█████████▎| 8.26GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 94%|█████████▎| 8.27GB / 8.83GB, 613MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 95%|█████████▌| 8.41GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 95%|█████████▌| 8.42GB / 8.83GB, 617MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 97%|█████████▋| 8.53GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 97%|█████████▋| 8.54GB / 8.83GB, 620MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 98%|█████████▊| 8.64GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 98%|█████████▊| 8.65GB / 8.83GB, 621MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 99%|█████████▉| 8.76GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (2 / 3) : 99%|█████████▉| 8.77GB / 8.83GB, 624MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 100%|██████████| 8.82GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (3 / 3) : 100%|██████████| 8.83GB / 8.83GB, 620MB/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 100%|██████████| 8.82GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 100%|██████████| 8.82GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 100%|██████████| 8.82GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 100%|██████████| 8.82GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 100%|██████████| 8.82GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB Processing Files (3 / 3) : 100%|██████████| 8.83GB / 8.83GB, 577MB/s
New Data Upload : | | 0.00B / 0.00B, 0.00B/s
...sampled/training_args.bin: 100%|██████████| 12.0kB / 12.0kB
...sampled/model.safetensors: 100%|██████████| 8.82GB / 8.82GB
...-upsampled/tokenizer.json: 100%|██████████| 11.4MB / 11.4MB