SLURM_JOB_ID = 9198833 SLURM_JOB_NAME = nvr_elm_llm:dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume RUN_NAME = tw-data-train_final_replaced_from_classified-fix-format-8node-resume OUTPUT_DIR = runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume NNODES = 8 SLURM_JOB_ID = 9198833 SLURM_JOB_NAME = nvr_elm_llm:dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume RUN_NAME = tw-data-train_final_replaced_from_classified-fix-format-8node-resume OUTPUT_DIR = runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume NNODES = 8 SLURM_JOB_ID = 9198833 NODES = batch-block1-0086 batch-block1-3859 batch-block1-0069 batch-block1-0070 batch-block1-3534 batch-block1-0075 batch-block1-3833 batch-block1-3273 SLURM_JOB_NAME = nvr_elm_llm:dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume RUN_NAME = tw-data-train_final_replaced_from_classified-fix-format-8node-resume OUTPUT_DIR = runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume NODE_RANK = 3 GPUS_PER_NODE = 8 NNODES = 8 MASTER_ADDR = batch-block1-0086 MASTER_PORT = 25001 GLOBAL_TRAIN_BATCH_SIZE = GRADIENT_ACCUMULATION_STEPS = PER_DEVICE_TRAIN_BATCH_SIZE = NODES = batch-block1-0086 batch-block1-3859 batch-block1-0069 batch-block1-0070 batch-block1-3534 batch-block1-0075 batch-block1-3833 batch-block1-3273 NODE_RANK = 5 GPUS_PER_NODE = 8 NODES = batch-block1-0086 batch-block1-3859 batch-block1-0069 batch-block1-0070 batch-block1-3534 batch-block1-0075 batch-block1-3833 batch-block1-3273 NODE_RANK = 7 GPUS_PER_NODE = 8 Starting training from base model: Qwen/Qwen3-8B MASTER_ADDR = batch-block1-0086 MASTER_PORT = 25001 GLOBAL_TRAIN_BATCH_SIZE = GRADIENT_ACCUMULATION_STEPS = PER_DEVICE_TRAIN_BATCH_SIZE = MASTER_ADDR = batch-block1-0086 MASTER_PORT = 25001 GLOBAL_TRAIN_BATCH_SIZE = GRADIENT_ACCUMULATION_STEPS = PER_DEVICE_TRAIN_BATCH_SIZE = Starting training from base model: Qwen/Qwen3-8B Starting training from base model: Qwen/Qwen3-8B SLURM_JOB_ID = 9198833 SLURM_JOB_NAME = nvr_elm_llm:dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume RUN_NAME = tw-data-train_final_replaced_from_classified-fix-format-8node-resume OUTPUT_DIR = runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume NNODES = 8 NODES = batch-block1-0086 batch-block1-3859 batch-block1-0069 batch-block1-0070 batch-block1-3534 batch-block1-0075 batch-block1-3833 batch-block1-3273 NODE_RANK = 0 GPUS_PER_NODE = 8 MASTER_ADDR = batch-block1-0086 MASTER_PORT = 25001 GLOBAL_TRAIN_BATCH_SIZE = GRADIENT_ACCUMULATION_STEPS = PER_DEVICE_TRAIN_BATCH_SIZE = SLURM_JOB_ID = 9198833 SLURM_JOB_NAME = nvr_elm_llm:dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume RUN_NAME = tw-data-train_final_replaced_from_classified-fix-format-8node-resume OUTPUT_DIR = runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume NNODES = 8 Starting training from base model: Qwen/Qwen3-8B SLURM_JOB_ID = 9198833 SLURM_JOB_NAME = nvr_elm_llm:dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume RUN_NAME = tw-data-train_final_replaced_from_classified-fix-format-8node-resume OUTPUT_DIR = runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume NNODES = 8 NODES = batch-block1-0086 batch-block1-3859 batch-block1-0069 batch-block1-0070 batch-block1-3534 batch-block1-0075 batch-block1-3833 batch-block1-3273 NODE_RANK = 1 GPUS_PER_NODE = 8 MASTER_ADDR = batch-block1-0086 MASTER_PORT = 25001 GLOBAL_TRAIN_BATCH_SIZE = GRADIENT_ACCUMULATION_STEPS = PER_DEVICE_TRAIN_BATCH_SIZE = NODES = batch-block1-0086 batch-block1-3859 batch-block1-0069 batch-block1-0070 batch-block1-3534 batch-block1-0075 batch-block1-3833 batch-block1-3273 NODE_RANK = 6 GPUS_PER_NODE = 8 Starting training from base model: Qwen/Qwen3-8B MASTER_ADDR = batch-block1-0086 MASTER_PORT = 25001 GLOBAL_TRAIN_BATCH_SIZE = GRADIENT_ACCUMULATION_STEPS = PER_DEVICE_TRAIN_BATCH_SIZE = Starting training from base model: Qwen/Qwen3-8B SLURM_JOB_ID = 9198833 SLURM_JOB_NAME = nvr_elm_llm:dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume RUN_NAME = tw-data-train_final_replaced_from_classified-fix-format-8node-resume OUTPUT_DIR = runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume NNODES = 8 SLURM_JOB_ID = 9198833 SLURM_JOB_NAME = nvr_elm_llm:dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume RUN_NAME = tw-data-train_final_replaced_from_classified-fix-format-8node-resume OUTPUT_DIR = runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume NNODES = 8 NODES = batch-block1-0086 batch-block1-3859 batch-block1-0069 batch-block1-0070 batch-block1-3534 batch-block1-0075 batch-block1-3833 batch-block1-3273 NODE_RANK = 4 GPUS_PER_NODE = 8 NODES = batch-block1-0086 batch-block1-3859 batch-block1-0069 batch-block1-0070 batch-block1-3534 batch-block1-0075 batch-block1-3833 batch-block1-3273 NODE_RANK = 2 GPUS_PER_NODE = 8 MASTER_ADDR = batch-block1-0086 MASTER_PORT = 25001 GLOBAL_TRAIN_BATCH_SIZE = GRADIENT_ACCUMULATION_STEPS = PER_DEVICE_TRAIN_BATCH_SIZE = MASTER_ADDR = batch-block1-0086 MASTER_PORT = 25001 GLOBAL_TRAIN_BATCH_SIZE = GRADIENT_ACCUMULATION_STEPS = PER_DEVICE_TRAIN_BATCH_SIZE = Starting training from base model: Qwen/Qwen3-8B Starting training from base model: Qwen/Qwen3-8B Imported prefix tree collator v1 Imported prefix tree collator v1Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1Imported prefix tree collator v1Imported prefix tree collator v1 Imported prefix tree collator v1Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1Imported prefix tree collator v1 Imported prefix tree collator v1Imported prefix tree collator v1Imported prefix tree collator v1 Imported prefix tree collator v1Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1Imported prefix tree collator v1Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1Imported prefix tree collator v1 Imported prefix tree collator v1Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1Imported prefix tree collator v1Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1Imported prefix tree collator v1Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 [2026-04-15 11:24:22,855] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:22,855] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:22,857] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:22,857] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:22,855] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:22,855] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:22,855] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:22,855] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:22,855] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:22,855] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:22,855] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:22,855] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:22,855] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:22,855] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:22,855] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:22,855] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:22,857] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:22,857] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:22,857] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:22,857] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:22,857] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:22,855] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:22,856] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:22,857] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:22,859] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:22,859] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:22,859] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:22,859] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:22,859] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:22,859] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:22,859] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:22,859] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:22,869] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:22,869] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:22,869] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:22,869] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:22,869] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:22,869] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:22,869] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:22,869] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:23,093] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:23,093] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:23,093] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:23,093] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:23,093] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:23,093] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:23,093] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:23,094] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:23,194] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:23,194] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:23,194] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:23,194] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:23,194] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:23,194] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:23,194] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:23,194] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:23,634] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:23,634] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:23,634] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:23,634] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:23,634] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:23,634] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:23,634] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:23,634] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-15 11:24:34,714] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:34,714] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:34,714] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:34,714] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:34,714] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:34,714] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:34,714] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:34,714] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:34,716] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:34,714] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:34,714] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:34,714] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:34,714] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:34,714] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:34,714] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:34,714] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:34,714] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:34,714] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:34,714] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:34,714] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:34,714] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:34,714] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:34,716] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:34,716] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:34,716] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:34,716] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:34,716] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:34,716] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:34,715] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:34,714] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:34,715] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:34,716] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:35,013] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:35,013] [INFO] [comm.py:706:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl [2026-04-15 11:24:35,013] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:35,013] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:35,013] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:35,013] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:35,013] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:35,013] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:35,014] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:35,263] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:35,263] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:35,263] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:35,263] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:35,263] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:35,263] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:35,263] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:35,264] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:35,959] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:35,959] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:35,959] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:35,959] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:35,959] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:35,959] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:35,959] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:35,959] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:36,969] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:36,969] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:36,969] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:36,969] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:36,973] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:36,973] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:36,973] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:36,973] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:36,973] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:36,980] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:36,980] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:37,085] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:37,085] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:37,085] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:37,086] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:37,105] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:37,105] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:37,105] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:37,105] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:37,105] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:37,105] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:37,105] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:37,105] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:38,522] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:38,522] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:38,522] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:38,522] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:38,522] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:38,522] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:38,522] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:38,522] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-15 11:24:41,188] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:41,188] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:41,188] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:41,188] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:41,191] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:41,195] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:41,203] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:41,214] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:41,365] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:41,365] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:41,365] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:41,365] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:41,366] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:41,369] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:41,371] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:41,413] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:42,841] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:42,845] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:42,847] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:42,853] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:42,853] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:42,870] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:42,871] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:42,872] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:42,873] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:42,874] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:42,889] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:42,910] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:42,921] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:42,923] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:42,931] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:47,728] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:47,734] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:47,745] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:47,755] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:47,759] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:47,797] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:47,797] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:47,800] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:47,805] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:47,893] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-15 11:24:52,266] [INFO] [partition_parameters.py:348:__exit__] finished initializing model - num_params = 399, num_elems = 8.19B Using Qwen templates for Qwen/Qwen3-8B Using Qwen templates for Qwen/Qwen3-8B Using Qwen templates for Qwen/Qwen3-8B Using Qwen templates for Qwen/Qwen3-8B Using Qwen templates for Qwen/Qwen3-8B Using Qwen templates for Qwen/Qwen3-8B Using Qwen templates for Qwen/Qwen3-8B Using Qwen templates for Qwen/Qwen3-8B Using Qwen templates for Qwen/Qwen3-8B Using Qwen templates for Qwen/Qwen3-8B Using Qwen templates for Qwen/Qwen3-8B Using Qwen templates for Qwen/Qwen3-8B Using Qwen templates for Qwen/Qwen3-8B Using Qwen templates for Qwen/Qwen3-8B Using Qwen templates for Qwen/Qwen3-8B Using Qwen templates for Qwen/Qwen3-8BUsing Qwen templates for Qwen/Qwen3-8B Using Qwen templates for Qwen/Qwen3-8B Using Qwen templates for Qwen/Qwen3-8B Using Qwen templates for Qwen/Qwen3-8B Using Qwen templates for Qwen/Qwen3-8B Using Qwen templates for Qwen/Qwen3-8B Using Qwen templates for Qwen/Qwen3-8B Using Qwen templates for Qwen/Qwen3-8B Using Qwen templates for Qwen/Qwen3-8B Using Qwen templates for Qwen/Qwen3-8B Using Qwen templates for Qwen/Qwen3-8B Using Qwen templates for Qwen/Qwen3-8BUsing Qwen templates for Qwen/Qwen3-8B Using Qwen templates for Qwen/Qwen3-8BUsing Qwen templates for Qwen/Qwen3-8B Using Qwen templates for Qwen/Qwen3-8BUsing Qwen templates for Qwen/Qwen3-8B Using Qwen templates for Qwen/Qwen3-8B Using Qwen templates for Qwen/Qwen3-8BUsing Qwen templates for Qwen/Qwen3-8BUsing Qwen templates for Qwen/Qwen3-8B Using Qwen templates for Qwen/Qwen3-8B Using Qwen templates for Qwen/Qwen3-8BUsing Qwen templates for Qwen/Qwen3-8B Using Qwen templates for Qwen/Qwen3-8B Using Qwen templates for Qwen/Qwen3-8B Using Qwen templates for Qwen/Qwen3-8BUsing Qwen templates for Qwen/Qwen3-8B Using Qwen templates for Qwen/Qwen3-8B Using Qwen templates for Qwen/Qwen3-8B Using Qwen templates for Qwen/Qwen3-8B Using Qwen templates for Qwen/Qwen3-8B Using Qwen templates for Qwen/Qwen3-8B Using Qwen templates for Qwen/Qwen3-8B Using Qwen templates for Qwen/Qwen3-8B Using Qwen templates for Qwen/Qwen3-8B Using Qwen templates for Qwen/Qwen3-8B Using Qwen templates for Qwen/Qwen3-8B Using Qwen templates for Qwen/Qwen3-8B Using Qwen templates for Qwen/Qwen3-8B Using Qwen templates for Qwen/Qwen3-8B Using Qwen templates for Qwen/Qwen3-8B Using Qwen templates for Qwen/Qwen3-8B Using Qwen templates for Qwen/Qwen3-8B Using Qwen templates for Qwen/Qwen3-8B Using Qwen templates for Qwen/Qwen3-8B Using Qwen templates for Qwen/Qwen3-8B Using Qwen templates for Qwen/Qwen3-8B Using Prefix Tree collator Using Prefix Tree collatorUsing Prefix Tree collator Using Prefix Tree collatorUsing Prefix Tree collatorUsing Prefix Tree collator Using Prefix Tree collator Using Prefix Tree collator args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-38_batch-block1-3833 args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-38_batch-block1-3833 args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-38_batch-block1-3833 args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-38_batch-block1-3833 args.report_to: ['wandb'] args.report_to: ['wandb']args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-38_batch-block1-3833 args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-38_batch-block1-3833 args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-38_batch-block1-3833 args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-38_batch-block1-3833 Using Prefix Tree collatorUsing Prefix Tree collator Using Prefix Tree collator Using Prefix Tree collatorUsing Prefix Tree collator Using Prefix Tree collatorUsing Prefix Tree collator Using Prefix Tree collator args.report_to: ['wandb']args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-34_batch-block1-0070args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-34_batch-block1-0070 args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-34_batch-block1-0070 args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-34_batch-block1-0070 args.report_to: ['wandb'] args.report_to: ['wandb']args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-34_batch-block1-0070 args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-34_batch-block1-0070 args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-34_batch-block1-0070 args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-34_batch-block1-0070 Using Prefix Tree collatorUsing Prefix Tree collator Using Prefix Tree collatorUsing Prefix Tree collator Using Prefix Tree collator Using Prefix Tree collatorUsing Prefix Tree collator Using Prefix Tree collator args.report_to: ['wandb'] args.report_to: ['wandb']args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-34_batch-block1-3273args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-34_batch-block1-3273 args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-34_batch-block1-3273 args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-34_batch-block1-3273 args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-34_batch-block1-3273 args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-34_batch-block1-3273 args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-34_batch-block1-3273 args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-34_batch-block1-3273 Using Prefix Tree collatorUsing Prefix Tree collator Using Prefix Tree collator Using Prefix Tree collatorUsing Prefix Tree collator Using Prefix Tree collatorUsing Prefix Tree collator Using Prefix Tree collator args.report_to: ['wandb']args.report_to: ['wandb']args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-34_batch-block1-0075args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-34_batch-block1-0075args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-34_batch-block1-0075 args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-34_batch-block1-0075 args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-34_batch-block1-0075 args.report_to: ['wandb']args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-34_batch-block1-0075args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-34_batch-block1-0075args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-34_batch-block1-0075 Using Prefix Tree collator Using Prefix Tree collatorUsing Prefix Tree collator Using Prefix Tree collator Using Prefix Tree collatorUsing Prefix Tree collatorUsing Prefix Tree collator Using Prefix Tree collator args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-34_batch-block1-3859 args.report_to: ['wandb'] args.report_to: ['wandb']args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-34_batch-block1-3859 args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-34_batch-block1-3859 args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-34_batch-block1-3859 args.report_to: ['wandb']args.report_to: ['wandb'] args.report_to: ['wandb']args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-34_batch-block1-3859 args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-34_batch-block1-3859 args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-34_batch-block1-3859 args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-34_batch-block1-3859 Using Prefix Tree collatorUsing Prefix Tree collator Using Prefix Tree collator Using Prefix Tree collatorUsing Prefix Tree collatorUsing Prefix Tree collator Using Prefix Tree collator Using Prefix Tree collator args.report_to: ['wandb'] args.report_to: ['wandb']args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-35_batch-block1-0069 args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-35_batch-block1-0069 args.report_to: ['wandb']args.report_to: ['wandb']args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-35_batch-block1-0069args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-35_batch-block1-0069args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-35_batch-block1-0069 args.report_to: ['wandb']args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-35_batch-block1-0069args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-35_batch-block1-0069 args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-35_batch-block1-0069 Using Prefix Tree collator Using Prefix Tree collator Using Prefix Tree collator Using Prefix Tree collatorUsing Prefix Tree collator Using Prefix Tree collator Using Prefix Tree collator Using Prefix Tree collator Using Prefix Tree collatorUsing Prefix Tree collatorUsing Prefix Tree collatorUsing Prefix Tree collator args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-35_batch-block1-3534 Using Prefix Tree collatorUsing Prefix Tree collatorUsing Prefix Tree collator Using Prefix Tree collator args.report_to: ['wandb'] args.report_to: ['wandb']args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-35_batch-block1-3534 args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-35_batch-block1-3534 args.report_to: ['wandb'] args.report_to: ['wandb']args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-35_batch-block1-3534 args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-35_batch-block1-3534 args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-35_batch-block1-3534 args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-35_batch-block1-3534 args.report_to: ['wandb'] args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-35_batch-block1-3534 args.report_to: ['wandb']args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-35_batch-block1-0086args.report_to: ['wandb']args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-35_batch-block1-0086args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-35_batch-block1-0086 args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-35_batch-block1-0086 args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-35_batch-block1-0086 args.report_to: ['wandb']args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-35_batch-block1-0086args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-35_batch-block1-0086 args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume/runs/Apr15_11-24-35_batch-block1-0086 Parameter Offload: Total persistent parameters: 308224 in 145 params Final batch size: 1, sequence length: 5686 Attention mask shape: torch.Size([1, 1, 5686, 5686]) Position ids shape: torch.Size([1, 5686]) Input IDs shape: torch.Size([1, 5686]) Labels shape: torch.Size([1, 5686]) Final batch size: 1, sequence length: 6986 Attention mask shape: torch.Size([1, 1, 6986, 6986]) Position ids shape: torch.Size([1, 6986]) Input IDs shape: torch.Size([1, 6986]) Labels shape: torch.Size([1, 6986]) Final batch size: 1, sequence length: 6362 Attention mask shape: torch.Size([1, 1, 6362, 6362]) Position ids shape: torch.Size([1, 6362]) Input IDs shape: torch.Size([1, 6362]) Labels shape: torch.Size([1, 6362]) Final batch size: 1, sequence length: 11304 Attention mask shape: torch.Size([1, 1, 11304, 11304]) Position ids shape: torch.Size([1, 11304]) Input IDs shape: torch.Size([1, 11304]) Labels shape: torch.Size([1, 11304]) Final batch size: 1, sequence length: 10523 Attention mask shape: torch.Size([1, 1, 10523, 10523]) Position ids shape: torch.Size([1, 10523]) Input IDs shape: torch.Size([1, 10523]) Labels shape: torch.Size([1, 10523]) Final batch size: 1, sequence length: 12847 Attention mask shape: torch.Size([1, 1, 12847, 12847]) Position ids shape: torch.Size([1, 12847]) Input IDs shape: torch.Size([1, 12847]) Labels shape: torch.Size([1, 12847]) Final batch size: 1, sequence length: 13224 Attention mask shape: torch.Size([1, 1, 13224, 13224]) Position ids shape: torch.Size([1, 13224]) Input IDs shape: torch.Size([1, 13224]) Labels shape: torch.Size([1, 13224]) Final batch size: 1, sequence length: 14168 Attention mask shape: torch.Size([1, 1, 14168, 14168]) Position ids shape: torch.Size([1, 14168]) Input IDs shape: torch.Size([1, 14168]) Labels shape: torch.Size([1, 14168]) Final batch size: 1, sequence length: 14304 Attention mask shape: torch.Size([1, 1, 14304, 14304]) Position ids shape: torch.Size([1, 14304]) Input IDs shape: torch.Size([1, 14304]) Labels shape: torch.Size([1, 14304]) Final batch size: 1, sequence length: 15652 Attention mask shape: torch.Size([1, 1, 15652, 15652]) Position ids shape: torch.Size([1, 15652]) Input IDs shape: torch.Size([1, 15652]) Labels shape: torch.Size([1, 15652]) Final batch size: 1, sequence length: 13872 Attention mask shape: torch.Size([1, 1, 13872, 13872]) Position ids shape: torch.Size([1, 13872]) Input IDs shape: torch.Size([1, 13872]) Labels shape: torch.Size([1, 13872]) Final batch size: 1, sequence length: 15621 Attention mask shape: torch.Size([1, 1, 15621, 15621]) Position ids shape: torch.Size([1, 15621]) Input IDs shape: torch.Size([1, 15621]) Labels shape: torch.Size([1, 15621]) Final batch size: 1, sequence length: 16378 Attention mask shape: torch.Size([1, 1, 16378, 16378]) Position ids shape: torch.Size([1, 16378]) Input IDs shape: torch.Size([1, 16378]) Labels shape: torch.Size([1, 16378]) Final batch size: 1, sequence length: 16501 Attention mask shape: torch.Size([1, 1, 16501, 16501]) Position ids shape: torch.Size([1, 16501]) Input IDs shape: torch.Size([1, 16501]) Labels shape: torch.Size([1, 16501]) Final batch size: 1, sequence length: 16514 Attention mask shape: torch.Size([1, 1, 16514, 16514]) Position ids shape: torch.Size([1, 16514]) Input IDs shape: torch.Size([1, 16514]) Labels shape: torch.Size([1, 16514]) Final batch size: 1, sequence length: 17213 Attention mask shape: torch.Size([1, 1, 17213, 17213]) Position ids shape: torch.Size([1, 17213]) Input IDs shape: torch.Size([1, 17213]) Labels shape: torch.Size([1, 17213]) Final batch size: 1, sequence length: 16440 Attention mask shape: torch.Size([1, 1, 16440, 16440]) Position ids shape: torch.Size([1, 16440]) Input IDs shape: torch.Size([1, 16440]) Labels shape: torch.Size([1, 16440]) Final batch size: 1, sequence length: 17852 Attention mask shape: torch.Size([1, 1, 17852, 17852]) Position ids shape: torch.Size([1, 17852]) Input IDs shape: torch.Size([1, 17852]) Labels shape: torch.Size([1, 17852]) Final batch size: 1, sequence length: 18246 Attention mask shape: torch.Size([1, 1, 18246, 18246]) Position ids shape: torch.Size([1, 18246]) Input IDs shape: torch.Size([1, 18246]) Labels shape: torch.Size([1, 18246]) Final batch size: 1, sequence length: 17773 Attention mask shape: torch.Size([1, 1, 17773, 17773]) Position ids shape: torch.Size([1, 17773]) Input IDs shape: torch.Size([1, 17773]) Labels shape: torch.Size([1, 17773]) Final batch size: 1, sequence length: 18155 Attention mask shape: torch.Size([1, 1, 18155, 18155]) Position ids shape: torch.Size([1, 18155]) Input IDs shape: torch.Size([1, 18155]) Labels shape: torch.Size([1, 18155]) Final batch size: 1, sequence length: 18289 Attention mask shape: torch.Size([1, 1, 18289, 18289]) Position ids shape: torch.Size([1, 18289]) Input IDs shape: torch.Size([1, 18289]) Labels shape: torch.Size([1, 18289]) Final batch size: 1, sequence length: 20766 Attention mask shape: torch.Size([1, 1, 20766, 20766]) Position ids shape: torch.Size([1, 20766]) Input IDs shape: torch.Size([1, 20766]) Labels shape: torch.Size([1, 20766]) Final batch size: 1, sequence length: 20813 Attention mask shape: torch.Size([1, 1, 20813, 20813]) Position ids shape: torch.Size([1, 20813]) Input IDs shape: torch.Size([1, 20813]) Labels shape: torch.Size([1, 20813]) Final batch size: 1, sequence length: 21562 Attention mask shape: torch.Size([1, 1, 21562, 21562]) Position ids shape: torch.Size([1, 21562]) Input IDs shape: torch.Size([1, 21562]) Labels shape: torch.Size([1, 21562]) Final batch size: 1, sequence length: 21316 Attention mask shape: torch.Size([1, 1, 21316, 21316]) Position ids shape: torch.Size([1, 21316]) Input IDs shape: torch.Size([1, 21316]) Labels shape: torch.Size([1, 21316]) Final batch size: 1, sequence length: 21705 Attention mask shape: torch.Size([1, 1, 21705, 21705]) Position ids shape: torch.Size([1, 21705]) Input IDs shape: torch.Size([1, 21705]) Labels shape: torch.Size([1, 21705]) Final batch size: 1, sequence length: 21607 Attention mask shape: torch.Size([1, 1, 21607, 21607]) Position ids shape: torch.Size([1, 21607]) Input IDs shape: torch.Size([1, 21607]) Labels shape: torch.Size([1, 21607]) Final batch size: 1, sequence length: 22152 Attention mask shape: torch.Size([1, 1, 22152, 22152]) Position ids shape: torch.Size([1, 22152]) Input IDs shape: torch.Size([1, 22152]) Labels shape: torch.Size([1, 22152]) Final batch size: 1, sequence length: 22197 Attention mask shape: torch.Size([1, 1, 22197, 22197]) Position ids shape: torch.Size([1, 22197]) Input IDs shape: torch.Size([1, 22197]) Labels shape: torch.Size([1, 22197]) Final batch size: 1, sequence length: 22784 Attention mask shape: torch.Size([1, 1, 22784, 22784]) Position ids shape: torch.Size([1, 22784]) Input IDs shape: torch.Size([1, 22784]) Labels shape: torch.Size([1, 22784]) Final batch size: 1, sequence length: 18545 Attention mask shape: torch.Size([1, 1, 18545, 18545]) Position ids shape: torch.Size([1, 18545]) Input IDs shape: torch.Size([1, 18545]) Labels shape: torch.Size([1, 18545]) Final batch size: 1, sequence length: 21163 Attention mask shape: torch.Size([1, 1, 21163, 21163]) Position ids shape: torch.Size([1, 21163]) Input IDs shape: torch.Size([1, 21163]) Labels shape: torch.Size([1, 21163]) Final batch size: 1, sequence length: 20176 Attention mask shape: torch.Size([1, 1, 20176, 20176]) Position ids shape: torch.Size([1, 20176]) Input IDs shape: torch.Size([1, 20176]) Labels shape: torch.Size([1, 20176]) Final batch size: 1, sequence length: 21962 Attention mask shape: torch.Size([1, 1, 21962, 21962]) Position ids shape: torch.Size([1, 21962]) Input IDs shape: torch.Size([1, 21962]) Labels shape: torch.Size([1, 21962]) Final batch size: 1, sequence length: 25053 Attention mask shape: torch.Size([1, 1, 25053, 25053]) Position ids shape: torch.Size([1, 25053]) Input IDs shape: torch.Size([1, 25053]) Labels shape: torch.Size([1, 25053]) Final batch size: 1, sequence length: 21207 Attention mask shape: torch.Size([1, 1, 21207, 21207]) Position ids shape: torch.Size([1, 21207]) Input IDs shape: torch.Size([1, 21207]) Labels shape: torch.Size([1, 21207]) Final batch size: 1, sequence length: 17910 Attention mask shape: torch.Size([1, 1, 17910, 17910]) Position ids shape: torch.Size([1, 17910]) Input IDs shape: torch.Size([1, 17910]) Labels shape: torch.Size([1, 17910]) Final batch size: 1, sequence length: 24365 Attention mask shape: torch.Size([1, 1, 24365, 24365]) Position ids shape: torch.Size([1, 24365]) Input IDs shape: torch.Size([1, 24365]) Labels shape: torch.Size([1, 24365]) Final batch size: 1, sequence length: 5911 Attention mask shape: torch.Size([1, 1, 5911, 5911]) Position ids shape: torch.Size([1, 5911]) Input IDs shape: torch.Size([1, 5911]) Labels shape: torch.Size([1, 5911]) Final batch size: 1, sequence length: 22117 Attention mask shape: torch.Size([1, 1, 22117, 22117]) Position ids shape: torch.Size([1, 22117]) Input IDs shape: torch.Size([1, 22117]) Labels shape: torch.Size([1, 22117]) Final batch size: 1, sequence length: 26494 Attention mask shape: torch.Size([1, 1, 26494, 26494]) Position ids shape: torch.Size([1, 26494]) Input IDs shape: torch.Size([1, 26494]) Labels shape: torch.Size([1, 26494]) Final batch size: 1, sequence length: 23338 Attention mask shape: torch.Size([1, 1, 23338, 23338]) Position ids shape: torch.Size([1, 23338]) Input IDs shape: torch.Size([1, 23338]) Labels shape: torch.Size([1, 23338]) Final batch size: 1, sequence length: 19962 Attention mask shape: torch.Size([1, 1, 19962, 19962]) Position ids shape: torch.Size([1, 19962]) Input IDs shape: torch.Size([1, 19962]) Labels shape: torch.Size([1, 19962]) Final batch size: 1, sequence length: 25880 Attention mask shape: torch.Size([1, 1, 25880, 25880]) Position ids shape: torch.Size([1, 25880]) Input IDs shape: torch.Size([1, 25880]) Labels shape: torch.Size([1, 25880]) Final batch size: 1, sequence length: 20101 Attention mask shape: torch.Size([1, 1, 20101, 20101]) Position ids shape: torch.Size([1, 20101]) Input IDs shape: torch.Size([1, 20101]) Labels shape: torch.Size([1, 20101]) Final batch size: 1, sequence length: 26921 Attention mask shape: torch.Size([1, 1, 26921, 26921]) Position ids shape: torch.Size([1, 26921]) Input IDs shape: torch.Size([1, 26921]) Labels shape: torch.Size([1, 26921]) Final batch size: 1, sequence length: 12421 Attention mask shape: torch.Size([1, 1, 12421, 12421]) Position ids shape: torch.Size([1, 12421]) Input IDs shape: torch.Size([1, 12421]) Labels shape: torch.Size([1, 12421]) Final batch size: 1, sequence length: 21672 Attention mask shape: torch.Size([1, 1, 21672, 21672]) Position ids shape: torch.Size([1, 21672]) Input IDs shape: torch.Size([1, 21672]) Labels shape: torch.Size([1, 21672]) Final batch size: 1, sequence length: 26303 Attention mask shape: torch.Size([1, 1, 26303, 26303]) Position ids shape: torch.Size([1, 26303]) Input IDs shape: torch.Size([1, 26303]) Labels shape: torch.Size([1, 26303]) Final batch size: 1, sequence length: 27633 Attention mask shape: torch.Size([1, 1, 27633, 27633]) Position ids shape: torch.Size([1, 27633]) Input IDs shape: torch.Size([1, 27633]) Labels shape: torch.Size([1, 27633]) Final batch size: 1, sequence length: 14496 Attention mask shape: torch.Size([1, 1, 14496, 14496]) Position ids shape: torch.Size([1, 14496]) Input IDs shape: torch.Size([1, 14496]) Labels shape: torch.Size([1, 14496]) Final batch size: 1, sequence length: 13031 Attention mask shape: torch.Size([1, 1, 13031, 13031]) Position ids shape: torch.Size([1, 13031]) Input IDs shape: torch.Size([1, 13031]) Labels shape: torch.Size([1, 13031]) Final batch size: 1, sequence length: 29404 Attention mask shape: torch.Size([1, 1, 29404, 29404]) Position ids shape: torch.Size([1, 29404]) Input IDs shape: torch.Size([1, 29404]) Labels shape: torch.Size([1, 29404]) Final batch size: 1, sequence length: 23975 Attention mask shape: torch.Size([1, 1, 23975, 23975]) Position ids shape: torch.Size([1, 23975]) Input IDs shape: torch.Size([1, 23975]) Labels shape: torch.Size([1, 23975]) Final batch size: 1, sequence length: 30592 Attention mask shape: torch.Size([1, 1, 30592, 30592]) Position ids shape: torch.Size([1, 30592]) Input IDs shape: torch.Size([1, 30592]) Labels shape: torch.Size([1, 30592]) Final batch size: 1, sequence length: 21581 Attention mask shape: torch.Size([1, 1, 21581, 21581]) Position ids shape: torch.Size([1, 21581]) Input IDs shape: torch.Size([1, 21581]) Labels shape: torch.Size([1, 21581]) Final batch size: 1, sequence length: 30965 Attention mask shape: torch.Size([1, 1, 30965, 30965]) Position ids shape: torch.Size([1, 30965]) Input IDs shape: torch.Size([1, 30965]) Labels shape: torch.Size([1, 30965]) Final batch size: 1, sequence length: 26138 Attention mask shape: torch.Size([1, 1, 26138, 26138]) Position ids shape: torch.Size([1, 26138]) Input IDs shape: torch.Size([1, 26138]) Labels shape: torch.Size([1, 26138]) Final batch size: 1, sequence length: 20184 Attention mask shape: torch.Size([1, 1, 20184, 20184]) Position ids shape: torch.Size([1, 20184]) Input IDs shape: torch.Size([1, 20184]) Labels shape: torch.Size([1, 20184]) Final batch size: 1, sequence length: 24121 Attention mask shape: torch.Size([1, 1, 24121, 24121]) Position ids shape: torch.Size([1, 24121]) Input IDs shape: torch.Size([1, 24121]) Labels shape: torch.Size([1, 24121]) Final batch size: 1, sequence length: 18470 Attention mask shape: torch.Size([1, 1, 18470, 18470]) Position ids shape: torch.Size([1, 18470]) Input IDs shape: torch.Size([1, 18470]) Labels shape: torch.Size([1, 18470]) Final batch size: 1, sequence length: 28523 Attention mask shape: torch.Size([1, 1, 28523, 28523]) Position ids shape: torch.Size([1, 28523]) Input IDs shape: torch.Size([1, 28523]) Labels shape: torch.Size([1, 28523]) Final batch size: 1, sequence length: 12224 Attention mask shape: torch.Size([1, 1, 12224, 12224]) Position ids shape: torch.Size([1, 12224]) Input IDs shape: torch.Size([1, 12224]) Labels shape: torch.Size([1, 12224]) Final batch size: 1, sequence length: 21766 Attention mask shape: torch.Size([1, 1, 21766, 21766]) Position ids shape: torch.Size([1, 21766]) Input IDs shape: torch.Size([1, 21766]) Labels shape: torch.Size([1, 21766]) Final batch size: 1, sequence length: 28897 Attention mask shape: torch.Size([1, 1, 28897, 28897]) Position ids shape: torch.Size([1, 28897]) Input IDs shape: torch.Size([1, 28897]) Labels shape: torch.Size([1, 28897]) Final batch size: 1, sequence length: 11515 Attention mask shape: torch.Size([1, 1, 11515, 11515]) Position ids shape: torch.Size([1, 11515]) Input IDs shape: torch.Size([1, 11515]) Labels shape: torch.Size([1, 11515]) Final batch size: 1, sequence length: 9704 Attention mask shape: torch.Size([1, 1, 9704, 9704]) Position ids shape: torch.Size([1, 9704]) Input IDs shape: torch.Size([1, 9704]) Labels shape: torch.Size([1, 9704]) Final batch size: 1, sequence length: 35100 Attention mask shape: torch.Size([1, 1, 35100, 35100]) Position ids shape: torch.Size([1, 35100]) Input IDs shape: torch.Size([1, 35100]) Labels shape: torch.Size([1, 35100]) Final batch size: 1, sequence length: 34469 Attention mask shape: torch.Size([1, 1, 34469, 34469]) Position ids shape: torch.Size([1, 34469]) Input IDs shape: torch.Size([1, 34469]) Labels shape: torch.Size([1, 34469]) Final batch size: 1, sequence length: 32473 Attention mask shape: torch.Size([1, 1, 32473, 32473]) Position ids shape: torch.Size([1, 32473]) Input IDs shape: torch.Size([1, 32473]) Labels shape: torch.Size([1, 32473]) Final batch size: 1, sequence length: 16057 Attention mask shape: torch.Size([1, 1, 16057, 16057]) Position ids shape: torch.Size([1, 16057]) Input IDs shape: torch.Size([1, 16057]) Labels shape: torch.Size([1, 16057]) Final batch size: 1, sequence length: 35485 Attention mask shape: torch.Size([1, 1, 35485, 35485]) Position ids shape: torch.Size([1, 35485]) Input IDs shape: torch.Size([1, 35485]) Labels shape: torch.Size([1, 35485]) Final batch size: 1, sequence length: 32596 Attention mask shape: torch.Size([1, 1, 32596, 32596]) Position ids shape: torch.Size([1, 32596]) Input IDs shape: torch.Size([1, 32596]) Labels shape: torch.Size([1, 32596]) Final batch size: 1, sequence length: 20198 Attention mask shape: torch.Size([1, 1, 20198, 20198]) Position ids shape: torch.Size([1, 20198]) Input IDs shape: torch.Size([1, 20198]) Labels shape: torch.Size([1, 20198]) Final batch size: 1, sequence length: 23558 Attention mask shape: torch.Size([1, 1, 23558, 23558]) Position ids shape: torch.Size([1, 23558]) Input IDs shape: torch.Size([1, 23558]) Labels shape: torch.Size([1, 23558]) Final batch size: 1, sequence length: 33482 Attention mask shape: torch.Size([1, 1, 33482, 33482]) Position ids shape: torch.Size([1, 33482]) Input IDs shape: torch.Size([1, 33482]) Labels shape: torch.Size([1, 33482]) Final batch size: 1, sequence length: 17456 Attention mask shape: torch.Size([1, 1, 17456, 17456]) Position ids shape: torch.Size([1, 17456]) Input IDs shape: torch.Size([1, 17456]) Labels shape: torch.Size([1, 17456]) Final batch size: 1, sequence length: 11608 Attention mask shape: torch.Size([1, 1, 11608, 11608]) Position ids shape: torch.Size([1, 11608]) Input IDs shape: torch.Size([1, 11608]) Labels shape: torch.Size([1, 11608]) Final batch size: 1, sequence length: 29481 Attention mask shape: torch.Size([1, 1, 29481, 29481]) Position ids shape: torch.Size([1, 29481]) Input IDs shape: torch.Size([1, 29481]) Labels shape: torch.Size([1, 29481]) Final batch size: 1, sequence length: 37980 Attention mask shape: torch.Size([1, 1, 37980, 37980]) Position ids shape: torch.Size([1, 37980]) Input IDs shape: torch.Size([1, 37980]) Labels shape: torch.Size([1, 37980]) Final batch size: 1, sequence length: 19620 Attention mask shape: torch.Size([1, 1, 19620, 19620]) Position ids shape: torch.Size([1, 19620]) Input IDs shape: torch.Size([1, 19620]) Labels shape: torch.Size([1, 19620]) Final batch size: 1, sequence length: 35030 Attention mask shape: torch.Size([1, 1, 35030, 35030]) Position ids shape: torch.Size([1, 35030]) Input IDs shape: torch.Size([1, 35030]) Labels shape: torch.Size([1, 35030]) Final batch size: 1, sequence length: 28773 Attention mask shape: torch.Size([1, 1, 28773, 28773]) Position ids shape: torch.Size([1, 28773]) Input IDs shape: torch.Size([1, 28773]) Labels shape: torch.Size([1, 28773]) Final batch size: 1, sequence length: 38927 Attention mask shape: torch.Size([1, 1, 38927, 38927]) Position ids shape: torch.Size([1, 38927]) Input IDs shape: torch.Size([1, 38927]) Labels shape: torch.Size([1, 38927]) Final batch size: 1, sequence length: 27702 Attention mask shape: torch.Size([1, 1, 27702, 27702]) Position ids shape: torch.Size([1, 27702]) Input IDs shape: torch.Size([1, 27702]) Labels shape: torch.Size([1, 27702]) Final batch size: 1, sequence length: 33685 Attention mask shape: torch.Size([1, 1, 33685, 33685]) Position ids shape: torch.Size([1, 33685]) Input IDs shape: torch.Size([1, 33685]) Labels shape: torch.Size([1, 33685]) Final batch size: 1, sequence length: 40526 Attention mask shape: torch.Size([1, 1, 40526, 40526]) Position ids shape: torch.Size([1, 40526]) Input IDs shape: torch.Size([1, 40526]) Labels shape: torch.Size([1, 40526]) Final batch size: 1, sequence length: 30072 Attention mask shape: torch.Size([1, 1, 30072, 30072]) Position ids shape: torch.Size([1, 30072]) Input IDs shape: torch.Size([1, 30072]) Labels shape: torch.Size([1, 30072]) Final batch size: 1, sequence length: 18126 Attention mask shape: torch.Size([1, 1, 18126, 18126]) Position ids shape: torch.Size([1, 18126]) Input IDs shape: torch.Size([1, 18126]) Labels shape: torch.Size([1, 18126]) Final batch size: 1, sequence length: 11403 Attention mask shape: torch.Size([1, 1, 11403, 11403]) Position ids shape: torch.Size([1, 11403]) Input IDs shape: torch.Size([1, 11403]) Labels shape: torch.Size([1, 11403]) Final batch size: 1, sequence length: 18044 Attention mask shape: torch.Size([1, 1, 18044, 18044]) Position ids shape: torch.Size([1, 18044]) Input IDs shape: torch.Size([1, 18044]) Labels shape: torch.Size([1, 18044]) Final batch size: 1, sequence length: 39589 Attention mask shape: torch.Size([1, 1, 39589, 39589]) Position ids shape: torch.Size([1, 39589]) Input IDs shape: torch.Size([1, 39589]) Labels shape: torch.Size([1, 39589]) Final batch size: 1, sequence length: 37992 Attention mask shape: torch.Size([1, 1, 37992, 37992]) Position ids shape: torch.Size([1, 37992]) Input IDs shape: torch.Size([1, 37992]) Labels shape: torch.Size([1, 37992]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 21491 Attention mask shape: torch.Size([1, 1, 21491, 21491]) Position ids shape: torch.Size([1, 21491]) Input IDs shape: torch.Size([1, 21491]) Labels shape: torch.Size([1, 21491]) Final batch size: 1, sequence length: 37040 Attention mask shape: torch.Size([1, 1, 37040, 37040]) Position ids shape: torch.Size([1, 37040]) Input IDs shape: torch.Size([1, 37040]) Labels shape: torch.Size([1, 37040]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 26215 Attention mask shape: torch.Size([1, 1, 26215, 26215]) Position ids shape: torch.Size([1, 26215]) Input IDs shape: torch.Size([1, 26215]) Labels shape: torch.Size([1, 26215]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 29537 Attention mask shape: torch.Size([1, 1, 29537, 29537]) Position ids shape: torch.Size([1, 29537]) Input IDs shape: torch.Size([1, 29537]) Labels shape: torch.Size([1, 29537]) Final batch size: 1, sequence length: 18023 Attention mask shape: torch.Size([1, 1, 18023, 18023]) Position ids shape: torch.Size([1, 18023]) Input IDs shape: torch.Size([1, 18023]) Labels shape: torch.Size([1, 18023]) Final batch size: 1, sequence length: 30789 Attention mask shape: torch.Size([1, 1, 30789, 30789]) Position ids shape: torch.Size([1, 30789]) Input IDs shape: torch.Size([1, 30789]) Labels shape: torch.Size([1, 30789]) Final batch size: 1, sequence length: 30772 Attention mask shape: torch.Size([1, 1, 30772, 30772]) Position ids shape: torch.Size([1, 30772]) Input IDs shape: torch.Size([1, 30772]) Labels shape: torch.Size([1, 30772]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 19538 Attention mask shape: torch.Size([1, 1, 19538, 19538]) Position ids shape: torch.Size([1, 19538]) Input IDs shape: torch.Size([1, 19538]) Labels shape: torch.Size([1, 19538]) Final batch size: 1, sequence length: 30859 Attention mask shape: torch.Size([1, 1, 30859, 30859]) Position ids shape: torch.Size([1, 30859]) Input IDs shape: torch.Size([1, 30859]) Labels shape: torch.Size([1, 30859]) Final batch size: 1, sequence length: 16257 Attention mask shape: torch.Size([1, 1, 16257, 16257]) Position ids shape: torch.Size([1, 16257]) Input IDs shape: torch.Size([1, 16257]) Labels shape: torch.Size([1, 16257]) Final batch size: 1, sequence length: 36580 Attention mask shape: torch.Size([1, 1, 36580, 36580]) Position ids shape: torch.Size([1, 36580]) Input IDs shape: torch.Size([1, 36580]) Labels shape: torch.Size([1, 36580]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 13622 Attention mask shape: torch.Size([1, 1, 13622, 13622]) Position ids shape: torch.Size([1, 13622]) Input IDs shape: torch.Size([1, 13622]) Labels shape: torch.Size([1, 13622]) Final batch size: 1, sequence length: 18454 Attention mask shape: torch.Size([1, 1, 18454, 18454]) Position ids shape: torch.Size([1, 18454]) Input IDs shape: torch.Size([1, 18454]) Labels shape: torch.Size([1, 18454]) Final batch size: 1, sequence length: 17870 Attention mask shape: torch.Size([1, 1, 17870, 17870]) Position ids shape: torch.Size([1, 17870]) Input IDs shape: torch.Size([1, 17870]) Labels shape: torch.Size([1, 17870]) Final batch size: 1, sequence length: 21250 Attention mask shape: torch.Size([1, 1, 21250, 21250]) Position ids shape: torch.Size([1, 21250]) Input IDs shape: torch.Size([1, 21250]) Labels shape: torch.Size([1, 21250]) Final batch size: 1, sequence length: 29875 Attention mask shape: torch.Size([1, 1, 29875, 29875]) Position ids shape: torch.Size([1, 29875]) Input IDs shape: torch.Size([1, 29875]) Labels shape: torch.Size([1, 29875]) Final batch size: 1, sequence length: 26635 Attention mask shape: torch.Size([1, 1, 26635, 26635]) Position ids shape: torch.Size([1, 26635]) Input IDs shape: torch.Size([1, 26635]) Labels shape: torch.Size([1, 26635]) Final batch size: 1, sequence length: 31745 Attention mask shape: torch.Size([1, 1, 31745, 31745]) Position ids shape: torch.Size([1, 31745]) Input IDs shape: torch.Size([1, 31745]) Labels shape: torch.Size([1, 31745]) Final batch size: 1, sequence length: 26708 Attention mask shape: torch.Size([1, 1, 26708, 26708]) Position ids shape: torch.Size([1, 26708]) Input IDs shape: torch.Size([1, 26708]) Labels shape: torch.Size([1, 26708]) Final batch size: 1, sequence length: 37866 Attention mask shape: torch.Size([1, 1, 37866, 37866]) Position ids shape: torch.Size([1, 37866]) Input IDs shape: torch.Size([1, 37866]) Labels shape: torch.Size([1, 37866]) Final batch size: 1, sequence length: 22309 Attention mask shape: torch.Size([1, 1, 22309, 22309]) Position ids shape: torch.Size([1, 22309]) Input IDs shape: torch.Size([1, 22309]) Labels shape: torch.Size([1, 22309]) Final batch size: 1, sequence length: 35077 Attention mask shape: torch.Size([1, 1, 35077, 35077]) Position ids shape: torch.Size([1, 35077]) Input IDs shape: torch.Size([1, 35077]) Labels shape: torch.Size([1, 35077]) Final batch size: 1, sequence length: 19702 Attention mask shape: torch.Size([1, 1, 19702, 19702]) Position ids shape: torch.Size([1, 19702]) Input IDs shape: torch.Size([1, 19702]) Labels shape: torch.Size([1, 19702]) Final batch size: 1, sequence length: 14730 Attention mask shape: torch.Size([1, 1, 14730, 14730]) Position ids shape: torch.Size([1, 14730]) Input IDs shape: torch.Size([1, 14730]) Labels shape: torch.Size([1, 14730]) Final batch size: 1, sequence length: 18606 Attention mask shape: torch.Size([1, 1, 18606, 18606]) Position ids shape: torch.Size([1, 18606]) Input IDs shape: torch.Size([1, 18606]) Labels shape: torch.Size([1, 18606]) Final batch size: 1, sequence length: 15993 Attention mask shape: torch.Size([1, 1, 15993, 15993]) Position ids shape: torch.Size([1, 15993]) Input IDs shape: torch.Size([1, 15993]) Labels shape: torch.Size([1, 15993]) Final batch size: 1, sequence length: 34684 Attention mask shape: torch.Size([1, 1, 34684, 34684]) Position ids shape: torch.Size([1, 34684]) Input IDs shape: torch.Size([1, 34684]) Labels shape: torch.Size([1, 34684]) Final batch size: 1, sequence length: 35952 Attention mask shape: torch.Size([1, 1, 35952, 35952]) Position ids shape: torch.Size([1, 35952]) Input IDs shape: torch.Size([1, 35952]) Labels shape: torch.Size([1, 35952]) Final batch size: 1, sequence length: 39661 Attention mask shape: torch.Size([1, 1, 39661, 39661]) Position ids shape: torch.Size([1, 39661]) Input IDs shape: torch.Size([1, 39661]) Labels shape: torch.Size([1, 39661]) Final batch size: 1, sequence length: 17811 Attention mask shape: torch.Size([1, 1, 17811, 17811]) Position ids shape: torch.Size([1, 17811]) Input IDs shape: torch.Size([1, 17811]) Labels shape: torch.Size([1, 17811]) Final batch size: 1, sequence length: 17623 Attention mask shape: torch.Size([1, 1, 17623, 17623]) Position ids shape: torch.Size([1, 17623]) Input IDs shape: torch.Size([1, 17623]) Labels shape: torch.Size([1, 17623]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 22945 Attention mask shape: torch.Size([1, 1, 22945, 22945]) Position ids shape: torch.Size([1, 22945]) Input IDs shape: torch.Size([1, 22945]) Labels shape: torch.Size([1, 22945]) Final batch size: 1, sequence length: 29464 Attention mask shape: torch.Size([1, 1, 29464, 29464]) Position ids shape: torch.Size([1, 29464]) Input IDs shape: torch.Size([1, 29464]) Labels shape: torch.Size([1, 29464]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 37407 Attention mask shape: torch.Size([1, 1, 37407, 37407]) Position ids shape: torch.Size([1, 37407]) Input IDs shape: torch.Size([1, 37407]) Labels shape: torch.Size([1, 37407]) Final batch size: 1, sequence length: 28810 Attention mask shape: torch.Size([1, 1, 28810, 28810]) Position ids shape: torch.Size([1, 28810]) Input IDs shape: torch.Size([1, 28810]) Labels shape: torch.Size([1, 28810]) Final batch size: 1, sequence length: 17971 Attention mask shape: torch.Size([1, 1, 17971, 17971]) Position ids shape: torch.Size([1, 17971]) Input IDs shape: torch.Size([1, 17971]) Labels shape: torch.Size([1, 17971]) Final batch size: 1, sequence length: 15830 Attention mask shape: torch.Size([1, 1, 15830, 15830]) Position ids shape: torch.Size([1, 15830]) Input IDs shape: torch.Size([1, 15830]) Labels shape: torch.Size([1, 15830]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 16270 Attention mask shape: torch.Size([1, 1, 16270, 16270]) Position ids shape: torch.Size([1, 16270]) Input IDs shape: torch.Size([1, 16270]) Labels shape: torch.Size([1, 16270]) Final batch size: 1, sequence length: 21936 Attention mask shape: torch.Size([1, 1, 21936, 21936]) Position ids shape: torch.Size([1, 21936]) Input IDs shape: torch.Size([1, 21936]) Labels shape: torch.Size([1, 21936]) Final batch size: 1, sequence length: 28814 Attention mask shape: torch.Size([1, 1, 28814, 28814]) Position ids shape: torch.Size([1, 28814]) Input IDs shape: torch.Size([1, 28814]) Labels shape: torch.Size([1, 28814]) Final batch size: 1, sequence length: 29639 Attention mask shape: torch.Size([1, 1, 29639, 29639]) Position ids shape: torch.Size([1, 29639]) Input IDs shape: torch.Size([1, 29639]) Labels shape: torch.Size([1, 29639]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 36716 Attention mask shape: torch.Size([1, 1, 36716, 36716]) Position ids shape: torch.Size([1, 36716]) Input IDs shape: torch.Size([1, 36716]) Labels shape: torch.Size([1, 36716]) Final batch size: 1, sequence length: 30428 Attention mask shape: torch.Size([1, 1, 30428, 30428]) Position ids shape: torch.Size([1, 30428]) Input IDs shape: torch.Size([1, 30428]) Labels shape: torch.Size([1, 30428]) Final batch size: 1, sequence length: 19028 Attention mask shape: torch.Size([1, 1, 19028, 19028]) Position ids shape: torch.Size([1, 19028]) Input IDs shape: torch.Size([1, 19028]) Labels shape: torch.Size([1, 19028]) Final batch size: 1, sequence length: 26665 Attention mask shape: torch.Size([1, 1, 26665, 26665]) Position ids shape: torch.Size([1, 26665]) Input IDs shape: torch.Size([1, 26665]) Labels shape: torch.Size([1, 26665]) Final batch size: 1, sequence length: 13903 Attention mask shape: torch.Size([1, 1, 13903, 13903]) Position ids shape: torch.Size([1, 13903]) Input IDs shape: torch.Size([1, 13903]) Labels shape: torch.Size([1, 13903]) Final batch size: 1, sequence length: 31381 Attention mask shape: torch.Size([1, 1, 31381, 31381]) Position ids shape: torch.Size([1, 31381]) Input IDs shape: torch.Size([1, 31381]) Labels shape: torch.Size([1, 31381]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 21805 Attention mask shape: torch.Size([1, 1, 21805, 21805]) Position ids shape: torch.Size([1, 21805]) Input IDs shape: torch.Size([1, 21805]) Labels shape: torch.Size([1, 21805]) Final batch size: 1, sequence length: 17778 Attention mask shape: torch.Size([1, 1, 17778, 17778]) Position ids shape: torch.Size([1, 17778]) Input IDs shape: torch.Size([1, 17778]) Labels shape: torch.Size([1, 17778]) Final batch size: 1, sequence length: 24232 Attention mask shape: torch.Size([1, 1, 24232, 24232]) Position ids shape: torch.Size([1, 24232]) Input IDs shape: torch.Size([1, 24232]) Labels shape: torch.Size([1, 24232]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 32024 Attention mask shape: torch.Size([1, 1, 32024, 32024]) Position ids shape: torch.Size([1, 32024]) Input IDs shape: torch.Size([1, 32024]) Labels shape: torch.Size([1, 32024]) Final batch size: 1, sequence length: 14104 Attention mask shape: torch.Size([1, 1, 14104, 14104]) Position ids shape: torch.Size([1, 14104]) Input IDs shape: torch.Size([1, 14104]) Labels shape: torch.Size([1, 14104]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 30135 Attention mask shape: torch.Size([1, 1, 30135, 30135]) Position ids shape: torch.Size([1, 30135]) Input IDs shape: torch.Size([1, 30135]) Labels shape: torch.Size([1, 30135]) Final batch size: 1, sequence length: 27281 Attention mask shape: torch.Size([1, 1, 27281, 27281]) Position ids shape: torch.Size([1, 27281]) Input IDs shape: torch.Size([1, 27281]) Labels shape: torch.Size([1, 27281]) Final batch size: 1, sequence length: 31420 Attention mask shape: torch.Size([1, 1, 31420, 31420]) Position ids shape: torch.Size([1, 31420]) Input IDs shape: torch.Size([1, 31420]) Labels shape: torch.Size([1, 31420]) Final batch size: 1, sequence length: 26930 Attention mask shape: torch.Size([1, 1, 26930, 26930]) Position ids shape: torch.Size([1, 26930]) Input IDs shape: torch.Size([1, 26930]) Labels shape: torch.Size([1, 26930]) Final batch size: 1, sequence length: 20765 Attention mask shape: torch.Size([1, 1, 20765, 20765]) Position ids shape: torch.Size([1, 20765]) Input IDs shape: torch.Size([1, 20765]) Labels shape: torch.Size([1, 20765]) Final batch size: 1, sequence length: 21071 Attention mask shape: torch.Size([1, 1, 21071, 21071]) Position ids shape: torch.Size([1, 21071]) Input IDs shape: torch.Size([1, 21071]) Labels shape: torch.Size([1, 21071]) Final batch size: 1, sequence length: 25388 Attention mask shape: torch.Size([1, 1, 25388, 25388]) Position ids shape: torch.Size([1, 25388]) Input IDs shape: torch.Size([1, 25388]) Labels shape: torch.Size([1, 25388]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 20057 Attention mask shape: torch.Size([1, 1, 20057, 20057]) Position ids shape: torch.Size([1, 20057]) Input IDs shape: torch.Size([1, 20057]) Labels shape: torch.Size([1, 20057]) Final batch size: 1, sequence length: 35879 Attention mask shape: torch.Size([1, 1, 35879, 35879]) Position ids shape: torch.Size([1, 35879]) Input IDs shape: torch.Size([1, 35879]) Labels shape: torch.Size([1, 35879]) Final batch size: 1, sequence length: 37397 Attention mask shape: torch.Size([1, 1, 37397, 37397]) Position ids shape: torch.Size([1, 37397]) Input IDs shape: torch.Size([1, 37397]) Labels shape: torch.Size([1, 37397]) Final batch size: 1, sequence length: 25032 Attention mask shape: torch.Size([1, 1, 25032, 25032]) Position ids shape: torch.Size([1, 25032]) Input IDs shape: torch.Size([1, 25032]) Labels shape: torch.Size([1, 25032]) Final batch size: 1, sequence length: 40579 Attention mask shape: torch.Size([1, 1, 40579, 40579]) Position ids shape: torch.Size([1, 40579]) Input IDs shape: torch.Size([1, 40579]) Labels shape: torch.Size([1, 40579]) Final batch size: 1, sequence length: 21006 Attention mask shape: torch.Size([1, 1, 21006, 21006]) Position ids shape: torch.Size([1, 21006]) Input IDs shape: torch.Size([1, 21006]) Labels shape: torch.Size([1, 21006]) Final batch size: 1, sequence length: 12200 Attention mask shape: torch.Size([1, 1, 12200, 12200]) Position ids shape: torch.Size([1, 12200]) Input IDs shape: torch.Size([1, 12200]) Labels shape: torch.Size([1, 12200]) Final batch size: 1, sequence length: 39778 Attention mask shape: torch.Size([1, 1, 39778, 39778]) Position ids shape: torch.Size([1, 39778]) Input IDs shape: torch.Size([1, 39778]) Labels shape: torch.Size([1, 39778]) Final batch size: 1, sequence length: 25383 Attention mask shape: torch.Size([1, 1, 25383, 25383]) Position ids shape: torch.Size([1, 25383]) Input IDs shape: torch.Size([1, 25383]) Labels shape: torch.Size([1, 25383]) Final batch size: 1, sequence length: 23187 Attention mask shape: torch.Size([1, 1, 23187, 23187]) Position ids shape: torch.Size([1, 23187]) Input IDs shape: torch.Size([1, 23187]) Labels shape: torch.Size([1, 23187]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 19668 Attention mask shape: torch.Size([1, 1, 19668, 19668]) Position ids shape: torch.Size([1, 19668]) Input IDs shape: torch.Size([1, 19668]) Labels shape: torch.Size([1, 19668]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 18566 Attention mask shape: torch.Size([1, 1, 18566, 18566]) Position ids shape: torch.Size([1, 18566]) Input IDs shape: torch.Size([1, 18566]) Labels shape: torch.Size([1, 18566]) Final batch size: 1, sequence length: 37890 Attention mask shape: torch.Size([1, 1, 37890, 37890]) Position ids shape: torch.Size([1, 37890]) Input IDs shape: torch.Size([1, 37890]) Labels shape: torch.Size([1, 37890]) Final batch size: 1, sequence length: 36415 Attention mask shape: torch.Size([1, 1, 36415, 36415]) Position ids shape: torch.Size([1, 36415]) Input IDs shape: torch.Size([1, 36415]) Labels shape: torch.Size([1, 36415]) Final batch size: 1, sequence length: 9251 Attention mask shape: torch.Size([1, 1, 9251, 9251]) Position ids shape: torch.Size([1, 9251]) Input IDs shape: torch.Size([1, 9251]) Labels shape: torch.Size([1, 9251]) Final batch size: 1, sequence length: 32541 Attention mask shape: torch.Size([1, 1, 32541, 32541]) Position ids shape: torch.Size([1, 32541]) Input IDs shape: torch.Size([1, 32541]) Labels shape: torch.Size([1, 32541]) Final batch size: 1, sequence length: 14308 Attention mask shape: torch.Size([1, 1, 14308, 14308]) Position ids shape: torch.Size([1, 14308]) Input IDs shape: torch.Size([1, 14308]) Labels shape: torch.Size([1, 14308]) Final batch size: 1, sequence length: 37530 Attention mask shape: torch.Size([1, 1, 37530, 37530]) Position ids shape: torch.Size([1, 37530]) Input IDs shape: torch.Size([1, 37530]) Labels shape: torch.Size([1, 37530]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 39587 Attention mask shape: torch.Size([1, 1, 39587, 39587]) Position ids shape: torch.Size([1, 39587]) Input IDs shape: torch.Size([1, 39587]) Labels shape: torch.Size([1, 39587]) Final batch size: 1, sequence length: 36647 Attention mask shape: torch.Size([1, 1, 36647, 36647]) Position ids shape: torch.Size([1, 36647]) Input IDs shape: torch.Size([1, 36647]) Labels shape: torch.Size([1, 36647]) Final batch size: 1, sequence length: 39579 Attention mask shape: torch.Size([1, 1, 39579, 39579]) Position ids shape: torch.Size([1, 39579]) Input IDs shape: torch.Size([1, 39579]) Labels shape: torch.Size([1, 39579]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) {'loss': 0.4208, 'grad_norm': 2.1084181933960093, 'learning_rate': 0.0, 'num_tokens': -inf, 'epoch': 0.12} Final batch size: 1, sequence length: 4858 Attention mask shape: torch.Size([1, 1, 4858, 4858]) Position ids shape: torch.Size([1, 4858]) Input IDs shape: torch.Size([1, 4858]) Labels shape: torch.Size([1, 4858]) Final batch size: 1, sequence length: 6316 Attention mask shape: torch.Size([1, 1, 6316, 6316]) Position ids shape: torch.Size([1, 6316]) Input IDs shape: torch.Size([1, 6316]) Labels shape: torch.Size([1, 6316]) Final batch size: 1, sequence length: 7360 Attention mask shape: torch.Size([1, 1, 7360, 7360]) Position ids shape: torch.Size([1, 7360]) Input IDs shape: torch.Size([1, 7360]) Labels shape: torch.Size([1, 7360]) Final batch size: 1, sequence length: 12075 Attention mask shape: torch.Size([1, 1, 12075, 12075]) Position ids shape: torch.Size([1, 12075]) Input IDs shape: torch.Size([1, 12075]) Labels shape: torch.Size([1, 12075]) Final batch size: 1, sequence length: 12945 Attention mask shape: torch.Size([1, 1, 12945, 12945]) Position ids shape: torch.Size([1, 12945]) Input IDs shape: torch.Size([1, 12945]) Labels shape: torch.Size([1, 12945]) Final batch size: 1, sequence length: 11448 Attention mask shape: torch.Size([1, 1, 11448, 11448]) Position ids shape: torch.Size([1, 11448]) Input IDs shape: torch.Size([1, 11448]) Labels shape: torch.Size([1, 11448]) Final batch size: 1, sequence length: 15189 Attention mask shape: torch.Size([1, 1, 15189, 15189]) Position ids shape: torch.Size([1, 15189]) Input IDs shape: torch.Size([1, 15189]) Labels shape: torch.Size([1, 15189]) Final batch size: 1, sequence length: 14891 Attention mask shape: torch.Size([1, 1, 14891, 14891]) Position ids shape: torch.Size([1, 14891]) Input IDs shape: torch.Size([1, 14891]) Labels shape: torch.Size([1, 14891]) Final batch size: 1, sequence length: 12846 Attention mask shape: torch.Size([1, 1, 12846, 12846]) Position ids shape: torch.Size([1, 12846]) Input IDs shape: torch.Size([1, 12846]) Labels shape: torch.Size([1, 12846]) Final batch size: 1, sequence length: 16961 Attention mask shape: torch.Size([1, 1, 16961, 16961]) Position ids shape: torch.Size([1, 16961]) Input IDs shape: torch.Size([1, 16961]) Labels shape: torch.Size([1, 16961]) Final batch size: 1, sequence length: 17246 Attention mask shape: torch.Size([1, 1, 17246, 17246]) Position ids shape: torch.Size([1, 17246]) Input IDs shape: torch.Size([1, 17246]) Labels shape: torch.Size([1, 17246]) Final batch size: 1, sequence length: 16658 Attention mask shape: torch.Size([1, 1, 16658, 16658]) Position ids shape: torch.Size([1, 16658]) Input IDs shape: torch.Size([1, 16658]) Labels shape: torch.Size([1, 16658]) Final batch size: 1, sequence length: 14330 Attention mask shape: torch.Size([1, 1, 14330, 14330]) Position ids shape: torch.Size([1, 14330]) Input IDs shape: torch.Size([1, 14330]) Labels shape: torch.Size([1, 14330]) Final batch size: 1, sequence length: 16831 Attention mask shape: torch.Size([1, 1, 16831, 16831]) Position ids shape: torch.Size([1, 16831]) Input IDs shape: torch.Size([1, 16831]) Labels shape: torch.Size([1, 16831]) Final batch size: 1, sequence length: 16391 Attention mask shape: torch.Size([1, 1, 16391, 16391]) Position ids shape: torch.Size([1, 16391]) Input IDs shape: torch.Size([1, 16391]) Labels shape: torch.Size([1, 16391]) Final batch size: 1, sequence length: 16326 Attention mask shape: torch.Size([1, 1, 16326, 16326]) Position ids shape: torch.Size([1, 16326]) Input IDs shape: torch.Size([1, 16326]) Labels shape: torch.Size([1, 16326]) Final batch size: 1, sequence length: 18341 Attention mask shape: torch.Size([1, 1, 18341, 18341]) Position ids shape: torch.Size([1, 18341]) Input IDs shape: torch.Size([1, 18341]) Labels shape: torch.Size([1, 18341]) Final batch size: 1, sequence length: 18400 Attention mask shape: torch.Size([1, 1, 18400, 18400]) Position ids shape: torch.Size([1, 18400]) Input IDs shape: torch.Size([1, 18400]) Labels shape: torch.Size([1, 18400]) Final batch size: 1, sequence length: 19332 Attention mask shape: torch.Size([1, 1, 19332, 19332]) Position ids shape: torch.Size([1, 19332]) Input IDs shape: torch.Size([1, 19332]) Labels shape: torch.Size([1, 19332]) Final batch size: 1, sequence length: 18424 Attention mask shape: torch.Size([1, 1, 18424, 18424]) Position ids shape: torch.Size([1, 18424]) Input IDs shape: torch.Size([1, 18424]) Labels shape: torch.Size([1, 18424]) Final batch size: 1, sequence length: 19999 Attention mask shape: torch.Size([1, 1, 19999, 19999]) Position ids shape: torch.Size([1, 19999]) Input IDs shape: torch.Size([1, 19999]) Labels shape: torch.Size([1, 19999]) Final batch size: 1, sequence length: 21122 Attention mask shape: torch.Size([1, 1, 21122, 21122]) Position ids shape: torch.Size([1, 21122]) Input IDs shape: torch.Size([1, 21122]) Labels shape: torch.Size([1, 21122]) Final batch size: 1, sequence length: 18014 Attention mask shape: torch.Size([1, 1, 18014, 18014]) Position ids shape: torch.Size([1, 18014]) Input IDs shape: torch.Size([1, 18014]) Labels shape: torch.Size([1, 18014]) Final batch size: 1, sequence length: 20912 Attention mask shape: torch.Size([1, 1, 20912, 20912]) Position ids shape: torch.Size([1, 20912]) Input IDs shape: torch.Size([1, 20912]) Labels shape: torch.Size([1, 20912]) Final batch size: 1, sequence length: 20888 Attention mask shape: torch.Size([1, 1, 20888, 20888]) Position ids shape: torch.Size([1, 20888]) Input IDs shape: torch.Size([1, 20888]) Labels shape: torch.Size([1, 20888]) Final batch size: 1, sequence length: 16536Final batch size: 1, sequence length: 19597 Attention mask shape: torch.Size([1, 1, 16536, 16536])Attention mask shape: torch.Size([1, 1, 19597, 19597]) Position ids shape: torch.Size([1, 19597])Position ids shape: torch.Size([1, 16536]) Input IDs shape: torch.Size([1, 19597])Input IDs shape: torch.Size([1, 16536]) Labels shape: torch.Size([1, 19597])Labels shape: torch.Size([1, 16536]) Final batch size: 1, sequence length: 21617 Attention mask shape: torch.Size([1, 1, 21617, 21617]) Position ids shape: torch.Size([1, 21617]) Input IDs shape: torch.Size([1, 21617]) Labels shape: torch.Size([1, 21617]) Final batch size: 1, sequence length: 20118 Attention mask shape: torch.Size([1, 1, 20118, 20118]) Position ids shape: torch.Size([1, 20118]) Input IDs shape: torch.Size([1, 20118]) Labels shape: torch.Size([1, 20118]) Final batch size: 1, sequence length: 21314 Attention mask shape: torch.Size([1, 1, 21314, 21314]) Position ids shape: torch.Size([1, 21314]) Input IDs shape: torch.Size([1, 21314]) Labels shape: torch.Size([1, 21314]) Final batch size: 1, sequence length: 18393 Attention mask shape: torch.Size([1, 1, 18393, 18393]) Position ids shape: torch.Size([1, 18393]) Input IDs shape: torch.Size([1, 18393]) Labels shape: torch.Size([1, 18393]) Final batch size: 1, sequence length: 13638 Attention mask shape: torch.Size([1, 1, 13638, 13638]) Position ids shape: torch.Size([1, 13638]) Input IDs shape: torch.Size([1, 13638]) Labels shape: torch.Size([1, 13638]) Final batch size: 1, sequence length: 22854 Attention mask shape: torch.Size([1, 1, 22854, 22854]) Position ids shape: torch.Size([1, 22854]) Input IDs shape: torch.Size([1, 22854]) Labels shape: torch.Size([1, 22854]) Final batch size: 1, sequence length: 25575 Attention mask shape: torch.Size([1, 1, 25575, 25575]) Position ids shape: torch.Size([1, 25575]) Input IDs shape: torch.Size([1, 25575]) Labels shape: torch.Size([1, 25575]) Final batch size: 1, sequence length: 21739 Attention mask shape: torch.Size([1, 1, 21739, 21739]) Position ids shape: torch.Size([1, 21739]) Input IDs shape: torch.Size([1, 21739]) Labels shape: torch.Size([1, 21739]) Final batch size: 1, sequence length: 25909 Attention mask shape: torch.Size([1, 1, 25909, 25909]) Position ids shape: torch.Size([1, 25909]) Input IDs shape: torch.Size([1, 25909]) Labels shape: torch.Size([1, 25909]) Final batch size: 1, sequence length: 25042 Attention mask shape: torch.Size([1, 1, 25042, 25042]) Position ids shape: torch.Size([1, 25042]) Input IDs shape: torch.Size([1, 25042]) Labels shape: torch.Size([1, 25042]) Final batch size: 1, sequence length: 9341 Attention mask shape: torch.Size([1, 1, 9341, 9341]) Position ids shape: torch.Size([1, 9341]) Input IDs shape: torch.Size([1, 9341]) Labels shape: torch.Size([1, 9341]) Final batch size: 1, sequence length: 23614 Attention mask shape: torch.Size([1, 1, 23614, 23614]) Position ids shape: torch.Size([1, 23614]) Input IDs shape: torch.Size([1, 23614]) Labels shape: torch.Size([1, 23614]) Final batch size: 1, sequence length: 24597 Attention mask shape: torch.Size([1, 1, 24597, 24597]) Position ids shape: torch.Size([1, 24597]) Input IDs shape: torch.Size([1, 24597]) Labels shape: torch.Size([1, 24597]) Final batch size: 1, sequence length: 21611 Attention mask shape: torch.Size([1, 1, 21611, 21611]) Position ids shape: torch.Size([1, 21611]) Input IDs shape: torch.Size([1, 21611]) Labels shape: torch.Size([1, 21611]) Final batch size: 1, sequence length: 15026 Attention mask shape: torch.Size([1, 1, 15026, 15026]) Position ids shape: torch.Size([1, 15026]) Input IDs shape: torch.Size([1, 15026]) Labels shape: torch.Size([1, 15026]) Final batch size: 1, sequence length: 15232 Attention mask shape: torch.Size([1, 1, 15232, 15232]) Position ids shape: torch.Size([1, 15232]) Input IDs shape: torch.Size([1, 15232]) Labels shape: torch.Size([1, 15232]) Final batch size: 1, sequence length: 21677 Attention mask shape: torch.Size([1, 1, 21677, 21677]) Position ids shape: torch.Size([1, 21677]) Input IDs shape: torch.Size([1, 21677]) Labels shape: torch.Size([1, 21677]) Final batch size: 1, sequence length: 24090 Attention mask shape: torch.Size([1, 1, 24090, 24090]) Position ids shape: torch.Size([1, 24090]) Input IDs shape: torch.Size([1, 24090]) Labels shape: torch.Size([1, 24090]) Final batch size: 1, sequence length: 24342 Attention mask shape: torch.Size([1, 1, 24342, 24342]) Position ids shape: torch.Size([1, 24342]) Input IDs shape: torch.Size([1, 24342]) Labels shape: torch.Size([1, 24342]) Final batch size: 1, sequence length: 19885 Attention mask shape: torch.Size([1, 1, 19885, 19885]) Position ids shape: torch.Size([1, 19885]) Input IDs shape: torch.Size([1, 19885]) Labels shape: torch.Size([1, 19885]) Final batch size: 1, sequence length: 28348 Attention mask shape: torch.Size([1, 1, 28348, 28348]) Position ids shape: torch.Size([1, 28348]) Input IDs shape: torch.Size([1, 28348]) Labels shape: torch.Size([1, 28348]) Final batch size: 1, sequence length: 26218 Attention mask shape: torch.Size([1, 1, 26218, 26218]) Position ids shape: torch.Size([1, 26218]) Input IDs shape: torch.Size([1, 26218]) Labels shape: torch.Size([1, 26218]) Final batch size: 1, sequence length: 12328 Attention mask shape: torch.Size([1, 1, 12328, 12328]) Position ids shape: torch.Size([1, 12328]) Input IDs shape: torch.Size([1, 12328]) Labels shape: torch.Size([1, 12328]) Final batch size: 1, sequence length: 20702 Attention mask shape: torch.Size([1, 1, 20702, 20702]) Position ids shape: torch.Size([1, 20702]) Input IDs shape: torch.Size([1, 20702]) Labels shape: torch.Size([1, 20702]) Final batch size: 1, sequence length: 21725 Attention mask shape: torch.Size([1, 1, 21725, 21725]) Position ids shape: torch.Size([1, 21725]) Input IDs shape: torch.Size([1, 21725]) Labels shape: torch.Size([1, 21725]) Final batch size: 1, sequence length: 29730 Attention mask shape: torch.Size([1, 1, 29730, 29730]) Position ids shape: torch.Size([1, 29730]) Input IDs shape: torch.Size([1, 29730]) Labels shape: torch.Size([1, 29730]) Final batch size: 1, sequence length: 6839 Attention mask shape: torch.Size([1, 1, 6839, 6839]) Position ids shape: torch.Size([1, 6839]) Input IDs shape: torch.Size([1, 6839]) Labels shape: torch.Size([1, 6839]) Final batch size: 1, sequence length: 27795 Attention mask shape: torch.Size([1, 1, 27795, 27795]) Position ids shape: torch.Size([1, 27795]) Input IDs shape: torch.Size([1, 27795]) Labels shape: torch.Size([1, 27795]) Final batch size: 1, sequence length: 30366 Attention mask shape: torch.Size([1, 1, 30366, 30366]) Position ids shape: torch.Size([1, 30366]) Input IDs shape: torch.Size([1, 30366]) Labels shape: torch.Size([1, 30366]) Final batch size: 1, sequence length: 13646 Attention mask shape: torch.Size([1, 1, 13646, 13646]) Position ids shape: torch.Size([1, 13646]) Input IDs shape: torch.Size([1, 13646]) Labels shape: torch.Size([1, 13646]) Final batch size: 1, sequence length: 20619 Attention mask shape: torch.Size([1, 1, 20619, 20619]) Position ids shape: torch.Size([1, 20619]) Input IDs shape: torch.Size([1, 20619]) Labels shape: torch.Size([1, 20619]) Final batch size: 1, sequence length: 16611 Attention mask shape: torch.Size([1, 1, 16611, 16611]) Position ids shape: torch.Size([1, 16611]) Input IDs shape: torch.Size([1, 16611]) Labels shape: torch.Size([1, 16611]) Final batch size: 1, sequence length: 28678 Attention mask shape: torch.Size([1, 1, 28678, 28678]) Position ids shape: torch.Size([1, 28678]) Input IDs shape: torch.Size([1, 28678]) Labels shape: torch.Size([1, 28678]) Final batch size: 1, sequence length: 30766 Attention mask shape: torch.Size([1, 1, 30766, 30766]) Position ids shape: torch.Size([1, 30766]) Input IDs shape: torch.Size([1, 30766]) Labels shape: torch.Size([1, 30766]) Final batch size: 1, sequence length: 32034 Attention mask shape: torch.Size([1, 1, 32034, 32034]) Position ids shape: torch.Size([1, 32034]) Input IDs shape: torch.Size([1, 32034]) Labels shape: torch.Size([1, 32034]) Final batch size: 1, sequence length: 30410 Attention mask shape: torch.Size([1, 1, 30410, 30410]) Position ids shape: torch.Size([1, 30410]) Input IDs shape: torch.Size([1, 30410]) Labels shape: torch.Size([1, 30410]) Final batch size: 1, sequence length: 17780 Attention mask shape: torch.Size([1, 1, 17780, 17780]) Position ids shape: torch.Size([1, 17780]) Input IDs shape: torch.Size([1, 17780]) Labels shape: torch.Size([1, 17780]) Final batch size: 1, sequence length: 20915 Attention mask shape: torch.Size([1, 1, 20915, 20915]) Position ids shape: torch.Size([1, 20915]) Input IDs shape: torch.Size([1, 20915]) Labels shape: torch.Size([1, 20915]) Final batch size: 1, sequence length: 25698 Attention mask shape: torch.Size([1, 1, 25698, 25698]) Position ids shape: torch.Size([1, 25698]) Input IDs shape: torch.Size([1, 25698]) Labels shape: torch.Size([1, 25698]) Final batch size: 1, sequence length: 22217 Attention mask shape: torch.Size([1, 1, 22217, 22217]) Position ids shape: torch.Size([1, 22217]) Input IDs shape: torch.Size([1, 22217]) Labels shape: torch.Size([1, 22217]) Final batch size: 1, sequence length: 30796 Attention mask shape: torch.Size([1, 1, 30796, 30796]) Position ids shape: torch.Size([1, 30796]) Input IDs shape: torch.Size([1, 30796]) Labels shape: torch.Size([1, 30796]) Final batch size: 1, sequence length: 16403 Attention mask shape: torch.Size([1, 1, 16403, 16403]) Position ids shape: torch.Size([1, 16403]) Input IDs shape: torch.Size([1, 16403]) Labels shape: torch.Size([1, 16403]) Final batch size: 1, sequence length: 21623 Attention mask shape: torch.Size([1, 1, 21623, 21623]) Position ids shape: torch.Size([1, 21623]) Input IDs shape: torch.Size([1, 21623]) Labels shape: torch.Size([1, 21623]) Final batch size: 1, sequence length: 32308 Attention mask shape: torch.Size([1, 1, 32308, 32308]) Position ids shape: torch.Size([1, 32308]) Input IDs shape: torch.Size([1, 32308]) Labels shape: torch.Size([1, 32308]) Final batch size: 1, sequence length: 30917 Attention mask shape: torch.Size([1, 1, 30917, 30917]) Position ids shape: torch.Size([1, 30917]) Input IDs shape: torch.Size([1, 30917]) Labels shape: torch.Size([1, 30917]) Final batch size: 1, sequence length: 35781 Attention mask shape: torch.Size([1, 1, 35781, 35781]) Position ids shape: torch.Size([1, 35781]) Input IDs shape: torch.Size([1, 35781]) Labels shape: torch.Size([1, 35781]) Final batch size: 1, sequence length: 29202 Attention mask shape: torch.Size([1, 1, 29202, 29202]) Position ids shape: torch.Size([1, 29202]) Input IDs shape: torch.Size([1, 29202]) Labels shape: torch.Size([1, 29202]) Final batch size: 1, sequence length: 35628 Attention mask shape: torch.Size([1, 1, 35628, 35628]) Position ids shape: torch.Size([1, 35628]) Input IDs shape: torch.Size([1, 35628]) Labels shape: torch.Size([1, 35628]) Final batch size: 1, sequence length: 38362 Attention mask shape: torch.Size([1, 1, 38362, 38362]) Position ids shape: torch.Size([1, 38362]) Input IDs shape: torch.Size([1, 38362]) Labels shape: torch.Size([1, 38362]) Final batch size: 1, sequence length: 20028 Attention mask shape: torch.Size([1, 1, 20028, 20028]) Position ids shape: torch.Size([1, 20028]) Input IDs shape: torch.Size([1, 20028]) Labels shape: torch.Size([1, 20028]) Final batch size: 1, sequence length: 15726 Attention mask shape: torch.Size([1, 1, 15726, 15726]) Position ids shape: torch.Size([1, 15726]) Input IDs shape: torch.Size([1, 15726]) Labels shape: torch.Size([1, 15726]) Final batch size: 1, sequence length: 32630 Attention mask shape: torch.Size([1, 1, 32630, 32630]) Position ids shape: torch.Size([1, 32630]) Input IDs shape: torch.Size([1, 32630]) Labels shape: torch.Size([1, 32630]) Final batch size: 1, sequence length: 21562 Attention mask shape: torch.Size([1, 1, 21562, 21562]) Position ids shape: torch.Size([1, 21562]) Input IDs shape: torch.Size([1, 21562]) Labels shape: torch.Size([1, 21562]) Final batch size: 1, sequence length: 32239 Attention mask shape: torch.Size([1, 1, 32239, 32239]) Position ids shape: torch.Size([1, 32239]) Input IDs shape: torch.Size([1, 32239]) Labels shape: torch.Size([1, 32239]) Final batch size: 1, sequence length: 35697 Attention mask shape: torch.Size([1, 1, 35697, 35697]) Position ids shape: torch.Size([1, 35697]) Input IDs shape: torch.Size([1, 35697]) Labels shape: torch.Size([1, 35697]) Final batch size: 1, sequence length: 37195 Attention mask shape: torch.Size([1, 1, 37195, 37195]) Position ids shape: torch.Size([1, 37195]) Input IDs shape: torch.Size([1, 37195]) Labels shape: torch.Size([1, 37195]) Final batch size: 1, sequence length: 38420 Attention mask shape: torch.Size([1, 1, 38420, 38420]) Position ids shape: torch.Size([1, 38420]) Input IDs shape: torch.Size([1, 38420]) Labels shape: torch.Size([1, 38420]) Final batch size: 1, sequence length: 39608 Attention mask shape: torch.Size([1, 1, 39608, 39608]) Position ids shape: torch.Size([1, 39608]) Input IDs shape: torch.Size([1, 39608]) Labels shape: torch.Size([1, 39608]) Final batch size: 1, sequence length: 14070 Attention mask shape: torch.Size([1, 1, 14070, 14070]) Position ids shape: torch.Size([1, 14070]) Input IDs shape: torch.Size([1, 14070]) Labels shape: torch.Size([1, 14070]) Final batch size: 1, sequence length: 34802 Attention mask shape: torch.Size([1, 1, 34802, 34802]) Position ids shape: torch.Size([1, 34802]) Input IDs shape: torch.Size([1, 34802]) Labels shape: torch.Size([1, 34802]) Final batch size: 1, sequence length: 11896 Attention mask shape: torch.Size([1, 1, 11896, 11896]) Position ids shape: torch.Size([1, 11896]) Input IDs shape: torch.Size([1, 11896]) Labels shape: torch.Size([1, 11896]) Final batch size: 1, sequence length: 38190 Attention mask shape: torch.Size([1, 1, 38190, 38190]) Position ids shape: torch.Size([1, 38190]) Input IDs shape: torch.Size([1, 38190]) Labels shape: torch.Size([1, 38190]) Final batch size: 1, sequence length: 14828 Attention mask shape: torch.Size([1, 1, 14828, 14828]) Position ids shape: torch.Size([1, 14828]) Input IDs shape: torch.Size([1, 14828]) Labels shape: torch.Size([1, 14828]) Final batch size: 1, sequence length: 22768 Attention mask shape: torch.Size([1, 1, 22768, 22768]) Position ids shape: torch.Size([1, 22768]) Input IDs shape: torch.Size([1, 22768]) Labels shape: torch.Size([1, 22768]) Final batch size: 1, sequence length: 18568 Attention mask shape: torch.Size([1, 1, 18568, 18568]) Position ids shape: torch.Size([1, 18568]) Input IDs shape: torch.Size([1, 18568]) Labels shape: torch.Size([1, 18568]) Final batch size: 1, sequence length: 23626 Attention mask shape: torch.Size([1, 1, 23626, 23626]) Position ids shape: torch.Size([1, 23626]) Input IDs shape: torch.Size([1, 23626]) Labels shape: torch.Size([1, 23626]) Final batch size: 1, sequence length: 26840 Attention mask shape: torch.Size([1, 1, 26840, 26840]) Position ids shape: torch.Size([1, 26840]) Input IDs shape: torch.Size([1, 26840]) Labels shape: torch.Size([1, 26840]) Final batch size: 1, sequence length: 10426 Attention mask shape: torch.Size([1, 1, 10426, 10426]) Position ids shape: torch.Size([1, 10426]) Input IDs shape: torch.Size([1, 10426]) Labels shape: torch.Size([1, 10426]) Final batch size: 1, sequence length: 30651 Attention mask shape: torch.Size([1, 1, 30651, 30651]) Position ids shape: torch.Size([1, 30651]) Input IDs shape: torch.Size([1, 30651]) Labels shape: torch.Size([1, 30651]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 26287 Attention mask shape: torch.Size([1, 1, 26287, 26287]) Position ids shape: torch.Size([1, 26287]) Input IDs shape: torch.Size([1, 26287]) Labels shape: torch.Size([1, 26287]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 37529 Attention mask shape: torch.Size([1, 1, 37529, 37529]) Position ids shape: torch.Size([1, 37529]) Input IDs shape: torch.Size([1, 37529]) Labels shape: torch.Size([1, 37529]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 16304 Attention mask shape: torch.Size([1, 1, 16304, 16304]) Position ids shape: torch.Size([1, 16304]) Input IDs shape: torch.Size([1, 16304]) Labels shape: torch.Size([1, 16304]) Final batch size: 1, sequence length: 22199 Attention mask shape: torch.Size([1, 1, 22199, 22199]) Position ids shape: torch.Size([1, 22199]) Input IDs shape: torch.Size([1, 22199]) Labels shape: torch.Size([1, 22199]) Final batch size: 1, sequence length: 35914 Attention mask shape: torch.Size([1, 1, 35914, 35914]) Position ids shape: torch.Size([1, 35914]) Input IDs shape: torch.Size([1, 35914]) Labels shape: torch.Size([1, 35914]) Final batch size: 1, sequence length: 34785 Attention mask shape: torch.Size([1, 1, 34785, 34785]) Position ids shape: torch.Size([1, 34785]) Input IDs shape: torch.Size([1, 34785]) Labels shape: torch.Size([1, 34785]) Final batch size: 1, sequence length: 40088 Attention mask shape: torch.Size([1, 1, 40088, 40088]) Position ids shape: torch.Size([1, 40088]) Input IDs shape: torch.Size([1, 40088]) Labels shape: torch.Size([1, 40088]) Final batch size: 1, sequence length: 32559 Attention mask shape: torch.Size([1, 1, 32559, 32559]) Position ids shape: torch.Size([1, 32559]) Input IDs shape: torch.Size([1, 32559]) Labels shape: torch.Size([1, 32559]) Final batch size: 1, sequence length: 31143 Attention mask shape: torch.Size([1, 1, 31143, 31143]) Position ids shape: torch.Size([1, 31143]) Input IDs shape: torch.Size([1, 31143]) Labels shape: torch.Size([1, 31143]) Final batch size: 1, sequence length: 27819 Attention mask shape: torch.Size([1, 1, 27819, 27819]) Position ids shape: torch.Size([1, 27819]) Input IDs shape: torch.Size([1, 27819]) Labels shape: torch.Size([1, 27819]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 36638 Attention mask shape: torch.Size([1, 1, 36638, 36638]) Position ids shape: torch.Size([1, 36638]) Input IDs shape: torch.Size([1, 36638]) Labels shape: torch.Size([1, 36638]) Final batch size: 1, sequence length: 19494 Attention mask shape: torch.Size([1, 1, 19494, 19494]) Position ids shape: torch.Size([1, 19494]) Input IDs shape: torch.Size([1, 19494]) Labels shape: torch.Size([1, 19494]) Final batch size: 1, sequence length: 40763 Attention mask shape: torch.Size([1, 1, 40763, 40763]) Position ids shape: torch.Size([1, 40763]) Input IDs shape: torch.Size([1, 40763]) Labels shape: torch.Size([1, 40763]) Final batch size: 1, sequence length: 30835 Attention mask shape: torch.Size([1, 1, 30835, 30835]) Position ids shape: torch.Size([1, 30835]) Input IDs shape: torch.Size([1, 30835]) Labels shape: torch.Size([1, 30835]) Final batch size: 1, sequence length: 23936 Attention mask shape: torch.Size([1, 1, 23936, 23936]) Position ids shape: torch.Size([1, 23936]) Input IDs shape: torch.Size([1, 23936]) Labels shape: torch.Size([1, 23936]) Final batch size: 1, sequence length: 31316 Attention mask shape: torch.Size([1, 1, 31316, 31316]) Position ids shape: torch.Size([1, 31316]) Input IDs shape: torch.Size([1, 31316]) Labels shape: torch.Size([1, 31316]) Final batch size: 1, sequence length: 18859 Attention mask shape: torch.Size([1, 1, 18859, 18859]) Position ids shape: torch.Size([1, 18859]) Input IDs shape: torch.Size([1, 18859]) Labels shape: torch.Size([1, 18859]) Final batch size: 1, sequence length: 10198 Attention mask shape: torch.Size([1, 1, 10198, 10198]) Position ids shape: torch.Size([1, 10198]) Input IDs shape: torch.Size([1, 10198]) Labels shape: torch.Size([1, 10198]) Final batch size: 1, sequence length: 12605 Attention mask shape: torch.Size([1, 1, 12605, 12605]) Position ids shape: torch.Size([1, 12605]) Input IDs shape: torch.Size([1, 12605]) Labels shape: torch.Size([1, 12605]) Final batch size: 1, sequence length: 31754 Attention mask shape: torch.Size([1, 1, 31754, 31754]) Position ids shape: torch.Size([1, 31754]) Input IDs shape: torch.Size([1, 31754]) Labels shape: torch.Size([1, 31754]) Final batch size: 1, sequence length: 25573 Attention mask shape: torch.Size([1, 1, 25573, 25573]) Position ids shape: torch.Size([1, 25573]) Input IDs shape: torch.Size([1, 25573]) Labels shape: torch.Size([1, 25573]) Final batch size: 1, sequence length: 26269 Attention mask shape: torch.Size([1, 1, 26269, 26269]) Position ids shape: torch.Size([1, 26269]) Input IDs shape: torch.Size([1, 26269]) Labels shape: torch.Size([1, 26269]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 37170 Attention mask shape: torch.Size([1, 1, 37170, 37170]) Position ids shape: torch.Size([1, 37170]) Input IDs shape: torch.Size([1, 37170]) Labels shape: torch.Size([1, 37170]) {'loss': 0.4595, 'grad_norm': 2.3821542075582016, 'learning_rate': 2.5e-06, 'num_tokens': -inf, 'epoch': 0.25} Final batch size: 1, sequence length: 7998 Attention mask shape: torch.Size([1, 1, 7998, 7998]) Position ids shape: torch.Size([1, 7998]) Input IDs shape: torch.Size([1, 7998]) Labels shape: torch.Size([1, 7998]) Final batch size: 1, sequence length: 6925 Attention mask shape: torch.Size([1, 1, 6925, 6925]) Position ids shape: torch.Size([1, 6925]) Input IDs shape: torch.Size([1, 6925]) Labels shape: torch.Size([1, 6925]) Final batch size: 1, sequence length: 7402 Attention mask shape: torch.Size([1, 1, 7402, 7402]) Position ids shape: torch.Size([1, 7402]) Input IDs shape: torch.Size([1, 7402]) Labels shape: torch.Size([1, 7402]) Final batch size: 1, sequence length: 10102 Attention mask shape: torch.Size([1, 1, 10102, 10102]) Position ids shape: torch.Size([1, 10102]) Input IDs shape: torch.Size([1, 10102]) Labels shape: torch.Size([1, 10102]) Final batch size: 1, sequence length: 9452 Attention mask shape: torch.Size([1, 1, 9452, 9452]) Position ids shape: torch.Size([1, 9452]) Input IDs shape: torch.Size([1, 9452]) Labels shape: torch.Size([1, 9452]) Final batch size: 1, sequence length: 11947 Attention mask shape: torch.Size([1, 1, 11947, 11947]) Position ids shape: torch.Size([1, 11947]) Input IDs shape: torch.Size([1, 11947]) Labels shape: torch.Size([1, 11947]) Final batch size: 1, sequence length: 12719 Attention mask shape: torch.Size([1, 1, 12719, 12719]) Position ids shape: torch.Size([1, 12719]) Input IDs shape: torch.Size([1, 12719]) Labels shape: torch.Size([1, 12719]) Final batch size: 1, sequence length: 9243 Attention mask shape: torch.Size([1, 1, 9243, 9243]) Position ids shape: torch.Size([1, 9243]) Input IDs shape: torch.Size([1, 9243]) Labels shape: torch.Size([1, 9243]) Final batch size: 1, sequence length: 8432 Attention mask shape: torch.Size([1, 1, 8432, 8432]) Position ids shape: torch.Size([1, 8432]) Input IDs shape: torch.Size([1, 8432]) Labels shape: torch.Size([1, 8432]) Final batch size: 1, sequence length: 10862 Attention mask shape: torch.Size([1, 1, 10862, 10862]) Position ids shape: torch.Size([1, 10862]) Input IDs shape: torch.Size([1, 10862]) Labels shape: torch.Size([1, 10862]) Final batch size: 1, sequence length: 10804 Attention mask shape: torch.Size([1, 1, 10804, 10804]) Position ids shape: torch.Size([1, 10804]) Input IDs shape: torch.Size([1, 10804]) Labels shape: torch.Size([1, 10804]) Final batch size: 1, sequence length: 11623 Attention mask shape: torch.Size([1, 1, 11623, 11623]) Position ids shape: torch.Size([1, 11623]) Input IDs shape: torch.Size([1, 11623]) Labels shape: torch.Size([1, 11623]) Final batch size: 1, sequence length: 13209 Attention mask shape: torch.Size([1, 1, 13209, 13209]) Position ids shape: torch.Size([1, 13209]) Input IDs shape: torch.Size([1, 13209]) Labels shape: torch.Size([1, 13209]) Final batch size: 1, sequence length: 15886 Attention mask shape: torch.Size([1, 1, 15886, 15886]) Position ids shape: torch.Size([1, 15886]) Input IDs shape: torch.Size([1, 15886]) Labels shape: torch.Size([1, 15886]) Final batch size: 1, sequence length: 16060 Attention mask shape: torch.Size([1, 1, 16060, 16060]) Position ids shape: torch.Size([1, 16060]) Input IDs shape: torch.Size([1, 16060]) Labels shape: torch.Size([1, 16060]) Final batch size: 1, sequence length: 15053 Attention mask shape: torch.Size([1, 1, 15053, 15053]) Position ids shape: torch.Size([1, 15053]) Input IDs shape: torch.Size([1, 15053]) Labels shape: torch.Size([1, 15053]) Final batch size: 1, sequence length: 17370 Attention mask shape: torch.Size([1, 1, 17370, 17370]) Position ids shape: torch.Size([1, 17370]) Input IDs shape: torch.Size([1, 17370]) Labels shape: torch.Size([1, 17370]) Final batch size: 1, sequence length: 17376 Attention mask shape: torch.Size([1, 1, 17376, 17376]) Position ids shape: torch.Size([1, 17376]) Input IDs shape: torch.Size([1, 17376]) Labels shape: torch.Size([1, 17376]) Final batch size: 1, sequence length: 16536 Attention mask shape: torch.Size([1, 1, 16536, 16536]) Position ids shape: torch.Size([1, 16536]) Input IDs shape: torch.Size([1, 16536]) Labels shape: torch.Size([1, 16536]) Final batch size: 1, sequence length: 15794 Attention mask shape: torch.Size([1, 1, 15794, 15794]) Position ids shape: torch.Size([1, 15794]) Input IDs shape: torch.Size([1, 15794]) Labels shape: torch.Size([1, 15794]) Final batch size: 1, sequence length: 17918 Attention mask shape: torch.Size([1, 1, 17918, 17918]) Position ids shape: torch.Size([1, 17918]) Input IDs shape: torch.Size([1, 17918]) Labels shape: torch.Size([1, 17918]) Final batch size: 1, sequence length: 19609 Attention mask shape: torch.Size([1, 1, 19609, 19609]) Position ids shape: torch.Size([1, 19609]) Input IDs shape: torch.Size([1, 19609]) Labels shape: torch.Size([1, 19609]) Final batch size: 1, sequence length: 19935 Attention mask shape: torch.Size([1, 1, 19935, 19935]) Position ids shape: torch.Size([1, 19935]) Input IDs shape: torch.Size([1, 19935]) Labels shape: torch.Size([1, 19935]) Final batch size: 1, sequence length: 20185 Attention mask shape: torch.Size([1, 1, 20185, 20185]) Position ids shape: torch.Size([1, 20185]) Input IDs shape: torch.Size([1, 20185]) Labels shape: torch.Size([1, 20185]) Final batch size: 1, sequence length: 17767 Attention mask shape: torch.Size([1, 1, 17767, 17767]) Position ids shape: torch.Size([1, 17767]) Input IDs shape: torch.Size([1, 17767]) Labels shape: torch.Size([1, 17767]) Final batch size: 1, sequence length: 19629 Attention mask shape: torch.Size([1, 1, 19629, 19629]) Position ids shape: torch.Size([1, 19629]) Input IDs shape: torch.Size([1, 19629]) Labels shape: torch.Size([1, 19629]) Final batch size: 1, sequence length: 20554 Attention mask shape: torch.Size([1, 1, 20554, 20554]) Position ids shape: torch.Size([1, 20554]) Input IDs shape: torch.Size([1, 20554]) Labels shape: torch.Size([1, 20554]) Final batch size: 1, sequence length: 20056 Attention mask shape: torch.Size([1, 1, 20056, 20056]) Position ids shape: torch.Size([1, 20056]) Input IDs shape: torch.Size([1, 20056]) Labels shape: torch.Size([1, 20056]) Final batch size: 1, sequence length: 21980 Attention mask shape: torch.Size([1, 1, 21980, 21980]) Position ids shape: torch.Size([1, 21980]) Input IDs shape: torch.Size([1, 21980]) Labels shape: torch.Size([1, 21980]) Final batch size: 1, sequence length: 21421 Attention mask shape: torch.Size([1, 1, 21421, 21421]) Position ids shape: torch.Size([1, 21421]) Input IDs shape: torch.Size([1, 21421]) Labels shape: torch.Size([1, 21421]) Final batch size: 1, sequence length: 21117 Attention mask shape: torch.Size([1, 1, 21117, 21117]) Position ids shape: torch.Size([1, 21117]) Input IDs shape: torch.Size([1, 21117]) Labels shape: torch.Size([1, 21117]) Final batch size: 1, sequence length: 22797 Attention mask shape: torch.Size([1, 1, 22797, 22797]) Position ids shape: torch.Size([1, 22797]) Input IDs shape: torch.Size([1, 22797]) Labels shape: torch.Size([1, 22797]) Final batch size: 1, sequence length: 17625 Attention mask shape: torch.Size([1, 1, 17625, 17625]) Position ids shape: torch.Size([1, 17625]) Input IDs shape: torch.Size([1, 17625]) Labels shape: torch.Size([1, 17625]) Final batch size: 1, sequence length: 12006 Attention mask shape: torch.Size([1, 1, 12006, 12006]) Position ids shape: torch.Size([1, 12006]) Input IDs shape: torch.Size([1, 12006]) Labels shape: torch.Size([1, 12006]) Final batch size: 1, sequence length: 8498 Attention mask shape: torch.Size([1, 1, 8498, 8498]) Position ids shape: torch.Size([1, 8498]) Input IDs shape: torch.Size([1, 8498]) Labels shape: torch.Size([1, 8498]) Final batch size: 1, sequence length: 16141 Attention mask shape: torch.Size([1, 1, 16141, 16141]) Position ids shape: torch.Size([1, 16141]) Input IDs shape: torch.Size([1, 16141]) Labels shape: torch.Size([1, 16141]) Final batch size: 1, sequence length: 21771 Attention mask shape: torch.Size([1, 1, 21771, 21771]) Position ids shape: torch.Size([1, 21771]) Input IDs shape: torch.Size([1, 21771]) Labels shape: torch.Size([1, 21771]) Final batch size: 1, sequence length: 22683 Attention mask shape: torch.Size([1, 1, 22683, 22683]) Position ids shape: torch.Size([1, 22683]) Input IDs shape: torch.Size([1, 22683]) Labels shape: torch.Size([1, 22683]) Final batch size: 1, sequence length: 22208 Attention mask shape: torch.Size([1, 1, 22208, 22208]) Position ids shape: torch.Size([1, 22208]) Input IDs shape: torch.Size([1, 22208]) Labels shape: torch.Size([1, 22208]) Final batch size: 1, sequence length: 24515 Attention mask shape: torch.Size([1, 1, 24515, 24515]) Position ids shape: torch.Size([1, 24515]) Input IDs shape: torch.Size([1, 24515]) Labels shape: torch.Size([1, 24515]) Final batch size: 1, sequence length: 24308 Attention mask shape: torch.Size([1, 1, 24308, 24308]) Position ids shape: torch.Size([1, 24308]) Input IDs shape: torch.Size([1, 24308]) Labels shape: torch.Size([1, 24308]) Final batch size: 1, sequence length: 20524 Attention mask shape: torch.Size([1, 1, 20524, 20524]) Position ids shape: torch.Size([1, 20524]) Input IDs shape: torch.Size([1, 20524]) Labels shape: torch.Size([1, 20524]) Final batch size: 1, sequence length: 17665 Attention mask shape: torch.Size([1, 1, 17665, 17665]) Position ids shape: torch.Size([1, 17665]) Input IDs shape: torch.Size([1, 17665]) Labels shape: torch.Size([1, 17665]) Final batch size: 1, sequence length: 27179 Attention mask shape: torch.Size([1, 1, 27179, 27179]) Position ids shape: torch.Size([1, 27179]) Input IDs shape: torch.Size([1, 27179]) Labels shape: torch.Size([1, 27179]) Final batch size: 1, sequence length: 16439 Attention mask shape: torch.Size([1, 1, 16439, 16439]) Position ids shape: torch.Size([1, 16439]) Input IDs shape: torch.Size([1, 16439]) Labels shape: torch.Size([1, 16439]) Final batch size: 1, sequence length: 26449 Attention mask shape: torch.Size([1, 1, 26449, 26449]) Position ids shape: torch.Size([1, 26449]) Input IDs shape: torch.Size([1, 26449]) Labels shape: torch.Size([1, 26449]) Final batch size: 1, sequence length: 20816 Attention mask shape: torch.Size([1, 1, 20816, 20816]) Position ids shape: torch.Size([1, 20816]) Input IDs shape: torch.Size([1, 20816]) Labels shape: torch.Size([1, 20816]) Final batch size: 1, sequence length: 18991 Attention mask shape: torch.Size([1, 1, 18991, 18991]) Position ids shape: torch.Size([1, 18991]) Input IDs shape: torch.Size([1, 18991]) Labels shape: torch.Size([1, 18991]) Final batch size: 1, sequence length: 24433 Attention mask shape: torch.Size([1, 1, 24433, 24433]) Position ids shape: torch.Size([1, 24433]) Input IDs shape: torch.Size([1, 24433]) Labels shape: torch.Size([1, 24433]) Final batch size: 1, sequence length: 26142 Attention mask shape: torch.Size([1, 1, 26142, 26142]) Position ids shape: torch.Size([1, 26142]) Input IDs shape: torch.Size([1, 26142]) Labels shape: torch.Size([1, 26142]) Final batch size: 1, sequence length: 10857 Attention mask shape: torch.Size([1, 1, 10857, 10857]) Position ids shape: torch.Size([1, 10857]) Input IDs shape: torch.Size([1, 10857]) Labels shape: torch.Size([1, 10857]) Final batch size: 1, sequence length: 26121 Attention mask shape: torch.Size([1, 1, 26121, 26121]) Position ids shape: torch.Size([1, 26121]) Input IDs shape: torch.Size([1, 26121]) Labels shape: torch.Size([1, 26121]) Final batch size: 1, sequence length: 21152 Attention mask shape: torch.Size([1, 1, 21152, 21152]) Position ids shape: torch.Size([1, 21152]) Input IDs shape: torch.Size([1, 21152]) Labels shape: torch.Size([1, 21152]) Final batch size: 1, sequence length: 28164 Attention mask shape: torch.Size([1, 1, 28164, 28164]) Position ids shape: torch.Size([1, 28164]) Input IDs shape: torch.Size([1, 28164]) Labels shape: torch.Size([1, 28164]) Final batch size: 1, sequence length: 17363 Attention mask shape: torch.Size([1, 1, 17363, 17363]) Position ids shape: torch.Size([1, 17363]) Input IDs shape: torch.Size([1, 17363]) Labels shape: torch.Size([1, 17363]) Final batch size: 1, sequence length: 29628 Attention mask shape: torch.Size([1, 1, 29628, 29628]) Position ids shape: torch.Size([1, 29628]) Input IDs shape: torch.Size([1, 29628]) Labels shape: torch.Size([1, 29628]) Final batch size: 1, sequence length: 16564 Attention mask shape: torch.Size([1, 1, 16564, 16564]) Position ids shape: torch.Size([1, 16564]) Input IDs shape: torch.Size([1, 16564]) Labels shape: torch.Size([1, 16564]) Final batch size: 1, sequence length: 31507 Attention mask shape: torch.Size([1, 1, 31507, 31507]) Position ids shape: torch.Size([1, 31507]) Input IDs shape: torch.Size([1, 31507]) Labels shape: torch.Size([1, 31507]) Final batch size: 1, sequence length: 31339 Attention mask shape: torch.Size([1, 1, 31339, 31339]) Position ids shape: torch.Size([1, 31339]) Input IDs shape: torch.Size([1, 31339]) Labels shape: torch.Size([1, 31339]) Final batch size: 1, sequence length: 24802 Attention mask shape: torch.Size([1, 1, 24802, 24802]) Position ids shape: torch.Size([1, 24802]) Input IDs shape: torch.Size([1, 24802]) Labels shape: torch.Size([1, 24802]) Final batch size: 1, sequence length: 18122 Attention mask shape: torch.Size([1, 1, 18122, 18122]) Position ids shape: torch.Size([1, 18122]) Input IDs shape: torch.Size([1, 18122]) Labels shape: torch.Size([1, 18122]) Final batch size: 1, sequence length: 32752 Attention mask shape: torch.Size([1, 1, 32752, 32752]) Position ids shape: torch.Size([1, 32752]) Input IDs shape: torch.Size([1, 32752]) Labels shape: torch.Size([1, 32752]) Final batch size: 1, sequence length: 29392 Attention mask shape: torch.Size([1, 1, 29392, 29392]) Position ids shape: torch.Size([1, 29392]) Input IDs shape: torch.Size([1, 29392]) Labels shape: torch.Size([1, 29392]) Final batch size: 1, sequence length: 32466 Attention mask shape: torch.Size([1, 1, 32466, 32466]) Position ids shape: torch.Size([1, 32466]) Input IDs shape: torch.Size([1, 32466]) Labels shape: torch.Size([1, 32466]) Final batch size: 1, sequence length: 30197 Attention mask shape: torch.Size([1, 1, 30197, 30197]) Position ids shape: torch.Size([1, 30197]) Input IDs shape: torch.Size([1, 30197]) Labels shape: torch.Size([1, 30197]) Final batch size: 1, sequence length: 31525 Attention mask shape: torch.Size([1, 1, 31525, 31525]) Position ids shape: torch.Size([1, 31525]) Input IDs shape: torch.Size([1, 31525]) Labels shape: torch.Size([1, 31525]) Final batch size: 1, sequence length: 33894 Attention mask shape: torch.Size([1, 1, 33894, 33894]) Position ids shape: torch.Size([1, 33894]) Input IDs shape: torch.Size([1, 33894]) Labels shape: torch.Size([1, 33894]) Final batch size: 1, sequence length: 24527 Attention mask shape: torch.Size([1, 1, 24527, 24527]) Position ids shape: torch.Size([1, 24527]) Input IDs shape: torch.Size([1, 24527]) Labels shape: torch.Size([1, 24527]) Final batch size: 1, sequence length: 11107 Attention mask shape: torch.Size([1, 1, 11107, 11107]) Position ids shape: torch.Size([1, 11107]) Input IDs shape: torch.Size([1, 11107]) Labels shape: torch.Size([1, 11107]) Final batch size: 1, sequence length: 27061 Attention mask shape: torch.Size([1, 1, 27061, 27061]) Position ids shape: torch.Size([1, 27061]) Input IDs shape: torch.Size([1, 27061]) Labels shape: torch.Size([1, 27061]) Final batch size: 1, sequence length: 30181 Attention mask shape: torch.Size([1, 1, 30181, 30181]) Position ids shape: torch.Size([1, 30181]) Input IDs shape: torch.Size([1, 30181]) Labels shape: torch.Size([1, 30181]) Final batch size: 1, sequence length: 22762 Attention mask shape: torch.Size([1, 1, 22762, 22762]) Position ids shape: torch.Size([1, 22762]) Input IDs shape: torch.Size([1, 22762]) Labels shape: torch.Size([1, 22762]) Final batch size: 1, sequence length: 33794 Attention mask shape: torch.Size([1, 1, 33794, 33794]) Position ids shape: torch.Size([1, 33794]) Input IDs shape: torch.Size([1, 33794]) Labels shape: torch.Size([1, 33794]) Final batch size: 1, sequence length: 16837 Attention mask shape: torch.Size([1, 1, 16837, 16837]) Position ids shape: torch.Size([1, 16837]) Input IDs shape: torch.Size([1, 16837]) Labels shape: torch.Size([1, 16837]) Final batch size: 1, sequence length: 34711 Attention mask shape: torch.Size([1, 1, 34711, 34711]) Position ids shape: torch.Size([1, 34711]) Input IDs shape: torch.Size([1, 34711]) Labels shape: torch.Size([1, 34711]) Final batch size: 1, sequence length: 33087 Attention mask shape: torch.Size([1, 1, 33087, 33087]) Position ids shape: torch.Size([1, 33087]) Input IDs shape: torch.Size([1, 33087]) Labels shape: torch.Size([1, 33087]) Final batch size: 1, sequence length: 21143 Attention mask shape: torch.Size([1, 1, 21143, 21143]) Position ids shape: torch.Size([1, 21143]) Input IDs shape: torch.Size([1, 21143]) Labels shape: torch.Size([1, 21143]) Final batch size: 1, sequence length: 13942 Attention mask shape: torch.Size([1, 1, 13942, 13942]) Position ids shape: torch.Size([1, 13942]) Input IDs shape: torch.Size([1, 13942]) Labels shape: torch.Size([1, 13942]) Final batch size: 1, sequence length: 20274 Attention mask shape: torch.Size([1, 1, 20274, 20274]) Position ids shape: torch.Size([1, 20274]) Input IDs shape: torch.Size([1, 20274]) Labels shape: torch.Size([1, 20274]) Final batch size: 1, sequence length: 37343 Attention mask shape: torch.Size([1, 1, 37343, 37343]) Position ids shape: torch.Size([1, 37343]) Input IDs shape: torch.Size([1, 37343]) Labels shape: torch.Size([1, 37343]) Final batch size: 1, sequence length: 34405 Attention mask shape: torch.Size([1, 1, 34405, 34405]) Position ids shape: torch.Size([1, 34405]) Input IDs shape: torch.Size([1, 34405]) Labels shape: torch.Size([1, 34405]) Final batch size: 1, sequence length: 34512 Attention mask shape: torch.Size([1, 1, 34512, 34512]) Position ids shape: torch.Size([1, 34512]) Input IDs shape: torch.Size([1, 34512]) Labels shape: torch.Size([1, 34512]) Final batch size: 1, sequence length: 32891 Attention mask shape: torch.Size([1, 1, 32891, 32891]) Position ids shape: torch.Size([1, 32891]) Input IDs shape: torch.Size([1, 32891]) Labels shape: torch.Size([1, 32891]) Final batch size: 1, sequence length: 37555 Attention mask shape: torch.Size([1, 1, 37555, 37555]) Position ids shape: torch.Size([1, 37555]) Input IDs shape: torch.Size([1, 37555]) Labels shape: torch.Size([1, 37555]) Final batch size: 1, sequence length: 31561 Attention mask shape: torch.Size([1, 1, 31561, 31561]) Position ids shape: torch.Size([1, 31561]) Input IDs shape: torch.Size([1, 31561]) Labels shape: torch.Size([1, 31561]) Final batch size: 1, sequence length: 31875 Attention mask shape: torch.Size([1, 1, 31875, 31875]) Position ids shape: torch.Size([1, 31875]) Input IDs shape: torch.Size([1, 31875]) Labels shape: torch.Size([1, 31875]) Final batch size: 1, sequence length: 18229 Attention mask shape: torch.Size([1, 1, 18229, 18229]) Position ids shape: torch.Size([1, 18229]) Input IDs shape: torch.Size([1, 18229]) Labels shape: torch.Size([1, 18229]) Final batch size: 1, sequence length: 38010 Attention mask shape: torch.Size([1, 1, 38010, 38010]) Position ids shape: torch.Size([1, 38010]) Input IDs shape: torch.Size([1, 38010]) Labels shape: torch.Size([1, 38010]) Final batch size: 1, sequence length: 35523 Attention mask shape: torch.Size([1, 1, 35523, 35523]) Position ids shape: torch.Size([1, 35523]) Input IDs shape: torch.Size([1, 35523]) Labels shape: torch.Size([1, 35523]) Final batch size: 1, sequence length: 16716 Attention mask shape: torch.Size([1, 1, 16716, 16716]) Position ids shape: torch.Size([1, 16716]) Input IDs shape: torch.Size([1, 16716]) Labels shape: torch.Size([1, 16716]) Final batch size: 1, sequence length: 29216 Attention mask shape: torch.Size([1, 1, 29216, 29216]) Position ids shape: torch.Size([1, 29216]) Input IDs shape: torch.Size([1, 29216]) Labels shape: torch.Size([1, 29216]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 33367 Attention mask shape: torch.Size([1, 1, 33367, 33367]) Position ids shape: torch.Size([1, 33367]) Input IDs shape: torch.Size([1, 33367]) Labels shape: torch.Size([1, 33367]) Final batch size: 1, sequence length: 17181 Attention mask shape: torch.Size([1, 1, 17181, 17181]) Position ids shape: torch.Size([1, 17181]) Input IDs shape: torch.Size([1, 17181]) Labels shape: torch.Size([1, 17181]) Final batch size: 1, sequence length: 40910 Attention mask shape: torch.Size([1, 1, 40910, 40910]) Position ids shape: torch.Size([1, 40910]) Input IDs shape: torch.Size([1, 40910]) Labels shape: torch.Size([1, 40910]) Final batch size: 1, sequence length: 31518 Attention mask shape: torch.Size([1, 1, 31518, 31518]) Position ids shape: torch.Size([1, 31518]) Input IDs shape: torch.Size([1, 31518]) Labels shape: torch.Size([1, 31518]) Final batch size: 1, sequence length: 16433 Attention mask shape: torch.Size([1, 1, 16433, 16433]) Position ids shape: torch.Size([1, 16433]) Input IDs shape: torch.Size([1, 16433]) Labels shape: torch.Size([1, 16433]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 13978 Attention mask shape: torch.Size([1, 1, 13978, 13978]) Position ids shape: torch.Size([1, 13978]) Input IDs shape: torch.Size([1, 13978]) Labels shape: torch.Size([1, 13978]) Final batch size: 1, sequence length: 36450 Attention mask shape: torch.Size([1, 1, 36450, 36450]) Position ids shape: torch.Size([1, 36450]) Input IDs shape: torch.Size([1, 36450]) Labels shape: torch.Size([1, 36450]) Final batch size: 1, sequence length: 36464 Attention mask shape: torch.Size([1, 1, 36464, 36464]) Position ids shape: torch.Size([1, 36464]) Input IDs shape: torch.Size([1, 36464]) Labels shape: torch.Size([1, 36464]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 30134 Attention mask shape: torch.Size([1, 1, 30134, 30134]) Position ids shape: torch.Size([1, 30134]) Input IDs shape: torch.Size([1, 30134]) Labels shape: torch.Size([1, 30134]) Final batch size: 1, sequence length: 37349 Attention mask shape: torch.Size([1, 1, 37349, 37349]) Position ids shape: torch.Size([1, 37349]) Input IDs shape: torch.Size([1, 37349]) Labels shape: torch.Size([1, 37349]) Final batch size: 1, sequence length: 21859 Attention mask shape: torch.Size([1, 1, 21859, 21859]) Position ids shape: torch.Size([1, 21859]) Input IDs shape: torch.Size([1, 21859]) Labels shape: torch.Size([1, 21859]) Final batch size: 1, sequence length: 29042 Attention mask shape: torch.Size([1, 1, 29042, 29042]) Position ids shape: torch.Size([1, 29042]) Input IDs shape: torch.Size([1, 29042]) Labels shape: torch.Size([1, 29042]) Final batch size: 1, sequence length: 38529 Attention mask shape: torch.Size([1, 1, 38529, 38529]) Position ids shape: torch.Size([1, 38529]) Input IDs shape: torch.Size([1, 38529]) Labels shape: torch.Size([1, 38529]) Final batch size: 1, sequence length: 26922 Attention mask shape: torch.Size([1, 1, 26922, 26922]) Position ids shape: torch.Size([1, 26922]) Input IDs shape: torch.Size([1, 26922]) Labels shape: torch.Size([1, 26922]) Final batch size: 1, sequence length: 31449 Attention mask shape: torch.Size([1, 1, 31449, 31449]) Position ids shape: torch.Size([1, 31449]) Input IDs shape: torch.Size([1, 31449]) Labels shape: torch.Size([1, 31449]) Final batch size: 1, sequence length: 17758 Attention mask shape: torch.Size([1, 1, 17758, 17758]) Position ids shape: torch.Size([1, 17758]) Input IDs shape: torch.Size([1, 17758]) Labels shape: torch.Size([1, 17758]) Final batch size: 1, sequence length: 19287 Attention mask shape: torch.Size([1, 1, 19287, 19287]) Position ids shape: torch.Size([1, 19287]) Input IDs shape: torch.Size([1, 19287]) Labels shape: torch.Size([1, 19287]) Final batch size: 1, sequence length: 39169 Attention mask shape: torch.Size([1, 1, 39169, 39169]) Position ids shape: torch.Size([1, 39169]) Input IDs shape: torch.Size([1, 39169]) Labels shape: torch.Size([1, 39169]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40752 Attention mask shape: torch.Size([1, 1, 40752, 40752]) Position ids shape: torch.Size([1, 40752]) Input IDs shape: torch.Size([1, 40752]) Labels shape: torch.Size([1, 40752]) Final batch size: 1, sequence length: 39258 Attention mask shape: torch.Size([1, 1, 39258, 39258]) Position ids shape: torch.Size([1, 39258]) Input IDs shape: torch.Size([1, 39258]) Labels shape: torch.Size([1, 39258]) Final batch size: 1, sequence length: 17535 Attention mask shape: torch.Size([1, 1, 17535, 17535]) Position ids shape: torch.Size([1, 17535]) Input IDs shape: torch.Size([1, 17535]) Labels shape: torch.Size([1, 17535]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 26893 Attention mask shape: torch.Size([1, 1, 26893, 26893]) Position ids shape: torch.Size([1, 26893]) Input IDs shape: torch.Size([1, 26893]) Labels shape: torch.Size([1, 26893]) Final batch size: 1, sequence length: 38891 Attention mask shape: torch.Size([1, 1, 38891, 38891]) Position ids shape: torch.Size([1, 38891]) Input IDs shape: torch.Size([1, 38891]) Labels shape: torch.Size([1, 38891]) Final batch size: 1, sequence length: 32638 Attention mask shape: torch.Size([1, 1, 32638, 32638]) Position ids shape: torch.Size([1, 32638]) Input IDs shape: torch.Size([1, 32638]) Labels shape: torch.Size([1, 32638]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 17890 Attention mask shape: torch.Size([1, 1, 17890, 17890]) Position ids shape: torch.Size([1, 17890]) Input IDs shape: torch.Size([1, 17890]) Labels shape: torch.Size([1, 17890]) Final batch size: 1, sequence length: 39953 Attention mask shape: torch.Size([1, 1, 39953, 39953]) Position ids shape: torch.Size([1, 39953]) Input IDs shape: torch.Size([1, 39953]) Labels shape: torch.Size([1, 39953]) Final batch size: 1, sequence length: 32465 Attention mask shape: torch.Size([1, 1, 32465, 32465]) Position ids shape: torch.Size([1, 32465]) Input IDs shape: torch.Size([1, 32465]) Labels shape: torch.Size([1, 32465]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 18471 Attention mask shape: torch.Size([1, 1, 18471, 18471]) Position ids shape: torch.Size([1, 18471]) Input IDs shape: torch.Size([1, 18471]) Labels shape: torch.Size([1, 18471]) Final batch size: 1, sequence length: 36599 Attention mask shape: torch.Size([1, 1, 36599, 36599]) Position ids shape: torch.Size([1, 36599]) Input IDs shape: torch.Size([1, 36599]) Labels shape: torch.Size([1, 36599]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) {'loss': 0.4381, 'grad_norm': 2.0873407523448138, 'learning_rate': 5e-06, 'num_tokens': -inf, 'epoch': 0.38} Final batch size: 1, sequence length: 5818 Attention mask shape: torch.Size([1, 1, 5818, 5818]) Position ids shape: torch.Size([1, 5818]) Input IDs shape: torch.Size([1, 5818]) Labels shape: torch.Size([1, 5818]) Final batch size: 1, sequence length: 6215 Attention mask shape: torch.Size([1, 1, 6215, 6215]) Position ids shape: torch.Size([1, 6215]) Input IDs shape: torch.Size([1, 6215]) Labels shape: torch.Size([1, 6215]) Final batch size: 1, sequence length: 7977 Attention mask shape: torch.Size([1, 1, 7977, 7977]) Position ids shape: torch.Size([1, 7977]) Input IDs shape: torch.Size([1, 7977]) Labels shape: torch.Size([1, 7977]) Final batch size: 1, sequence length: 10107 Attention mask shape: torch.Size([1, 1, 10107, 10107]) Position ids shape: torch.Size([1, 10107]) Input IDs shape: torch.Size([1, 10107]) Labels shape: torch.Size([1, 10107]) Final batch size: 1, sequence length: 6095 Attention mask shape: torch.Size([1, 1, 6095, 6095]) Position ids shape: torch.Size([1, 6095]) Input IDs shape: torch.Size([1, 6095]) Labels shape: torch.Size([1, 6095]) Final batch size: 1, sequence length: 8500 Attention mask shape: torch.Size([1, 1, 8500, 8500]) Position ids shape: torch.Size([1, 8500]) Input IDs shape: torch.Size([1, 8500]) Labels shape: torch.Size([1, 8500]) Final batch size: 1, sequence length: 10080 Attention mask shape: torch.Size([1, 1, 10080, 10080]) Position ids shape: torch.Size([1, 10080]) Input IDs shape: torch.Size([1, 10080]) Labels shape: torch.Size([1, 10080]) Final batch size: 1, sequence length: 12454 Attention mask shape: torch.Size([1, 1, 12454, 12454]) Position ids shape: torch.Size([1, 12454]) Input IDs shape: torch.Size([1, 12454]) Labels shape: torch.Size([1, 12454]) Final batch size: 1, sequence length: 12826 Attention mask shape: torch.Size([1, 1, 12826, 12826]) Position ids shape: torch.Size([1, 12826]) Input IDs shape: torch.Size([1, 12826]) Labels shape: torch.Size([1, 12826]) Final batch size: 1, sequence length: 12928 Attention mask shape: torch.Size([1, 1, 12928, 12928]) Position ids shape: torch.Size([1, 12928]) Input IDs shape: torch.Size([1, 12928]) Labels shape: torch.Size([1, 12928]) Final batch size: 1, sequence length: 13624 Attention mask shape: torch.Size([1, 1, 13624, 13624]) Position ids shape: torch.Size([1, 13624]) Input IDs shape: torch.Size([1, 13624]) Labels shape: torch.Size([1, 13624]) Final batch size: 1, sequence length: 9217 Attention mask shape: torch.Size([1, 1, 9217, 9217]) Position ids shape: torch.Size([1, 9217]) Input IDs shape: torch.Size([1, 9217]) Labels shape: torch.Size([1, 9217]) Final batch size: 1, sequence length: 13092 Attention mask shape: torch.Size([1, 1, 13092, 13092]) Position ids shape: torch.Size([1, 13092]) Input IDs shape: torch.Size([1, 13092]) Labels shape: torch.Size([1, 13092]) Final batch size: 1, sequence length: 13459 Attention mask shape: torch.Size([1, 1, 13459, 13459]) Position ids shape: torch.Size([1, 13459]) Input IDs shape: torch.Size([1, 13459]) Labels shape: torch.Size([1, 13459]) Final batch size: 1, sequence length: 12556 Attention mask shape: torch.Size([1, 1, 12556, 12556]) Position ids shape: torch.Size([1, 12556]) Input IDs shape: torch.Size([1, 12556]) Labels shape: torch.Size([1, 12556]) Final batch size: 1, sequence length: 10505 Attention mask shape: torch.Size([1, 1, 10505, 10505]) Position ids shape: torch.Size([1, 10505]) Input IDs shape: torch.Size([1, 10505]) Labels shape: torch.Size([1, 10505]) Final batch size: 1, sequence length: 9379 Attention mask shape: torch.Size([1, 1, 9379, 9379]) Position ids shape: torch.Size([1, 9379]) Input IDs shape: torch.Size([1, 9379]) Labels shape: torch.Size([1, 9379]) Final batch size: 1, sequence length: 10469 Attention mask shape: torch.Size([1, 1, 10469, 10469]) Position ids shape: torch.Size([1, 10469]) Input IDs shape: torch.Size([1, 10469]) Labels shape: torch.Size([1, 10469]) Final batch size: 1, sequence length: 17417 Attention mask shape: torch.Size([1, 1, 17417, 17417]) Position ids shape: torch.Size([1, 17417]) Input IDs shape: torch.Size([1, 17417]) Labels shape: torch.Size([1, 17417]) Final batch size: 1, sequence length: 10687 Attention mask shape: torch.Size([1, 1, 10687, 10687]) Position ids shape: torch.Size([1, 10687]) Input IDs shape: torch.Size([1, 10687]) Labels shape: torch.Size([1, 10687]) Final batch size: 1, sequence length: 10277 Attention mask shape: torch.Size([1, 1, 10277, 10277]) Position ids shape: torch.Size([1, 10277]) Input IDs shape: torch.Size([1, 10277]) Labels shape: torch.Size([1, 10277]) Final batch size: 1, sequence length: 16782 Attention mask shape: torch.Size([1, 1, 16782, 16782]) Position ids shape: torch.Size([1, 16782]) Input IDs shape: torch.Size([1, 16782]) Labels shape: torch.Size([1, 16782]) Final batch size: 1, sequence length: 17980 Attention mask shape: torch.Size([1, 1, 17980, 17980]) Position ids shape: torch.Size([1, 17980]) Input IDs shape: torch.Size([1, 17980]) Labels shape: torch.Size([1, 17980]) Final batch size: 1, sequence length: 18454 Attention mask shape: torch.Size([1, 1, 18454, 18454]) Position ids shape: torch.Size([1, 18454]) Input IDs shape: torch.Size([1, 18454]) Labels shape: torch.Size([1, 18454]) Final batch size: 1, sequence length: 19671 Attention mask shape: torch.Size([1, 1, 19671, 19671]) Position ids shape: torch.Size([1, 19671]) Input IDs shape: torch.Size([1, 19671]) Labels shape: torch.Size([1, 19671]) Final batch size: 1, sequence length: 15185 Attention mask shape: torch.Size([1, 1, 15185, 15185]) Position ids shape: torch.Size([1, 15185]) Input IDs shape: torch.Size([1, 15185]) Labels shape: torch.Size([1, 15185]) Final batch size: 1, sequence length: 19683 Attention mask shape: torch.Size([1, 1, 19683, 19683]) Position ids shape: torch.Size([1, 19683]) Input IDs shape: torch.Size([1, 19683]) Labels shape: torch.Size([1, 19683]) Final batch size: 1, sequence length: 20243 Attention mask shape: torch.Size([1, 1, 20243, 20243]) Position ids shape: torch.Size([1, 20243]) Input IDs shape: torch.Size([1, 20243]) Labels shape: torch.Size([1, 20243]) Final batch size: 1, sequence length: 20191 Attention mask shape: torch.Size([1, 1, 20191, 20191]) Position ids shape: torch.Size([1, 20191]) Input IDs shape: torch.Size([1, 20191]) Labels shape: torch.Size([1, 20191]) Final batch size: 1, sequence length: 18469 Attention mask shape: torch.Size([1, 1, 18469, 18469]) Position ids shape: torch.Size([1, 18469]) Input IDs shape: torch.Size([1, 18469]) Labels shape: torch.Size([1, 18469]) Final batch size: 1, sequence length: 19187 Attention mask shape: torch.Size([1, 1, 19187, 19187]) Position ids shape: torch.Size([1, 19187]) Input IDs shape: torch.Size([1, 19187]) Labels shape: torch.Size([1, 19187]) Final batch size: 1, sequence length: 10390 Attention mask shape: torch.Size([1, 1, 10390, 10390]) Position ids shape: torch.Size([1, 10390]) Input IDs shape: torch.Size([1, 10390]) Labels shape: torch.Size([1, 10390]) Final batch size: 1, sequence length: 21051 Attention mask shape: torch.Size([1, 1, 21051, 21051]) Position ids shape: torch.Size([1, 21051]) Input IDs shape: torch.Size([1, 21051]) Labels shape: torch.Size([1, 21051]) Final batch size: 1, sequence length: 15263 Attention mask shape: torch.Size([1, 1, 15263, 15263]) Position ids shape: torch.Size([1, 15263]) Input IDs shape: torch.Size([1, 15263]) Labels shape: torch.Size([1, 15263]) Final batch size: 1, sequence length: 10182 Attention mask shape: torch.Size([1, 1, 10182, 10182]) Position ids shape: torch.Size([1, 10182]) Input IDs shape: torch.Size([1, 10182]) Labels shape: torch.Size([1, 10182]) Final batch size: 1, sequence length: 20432 Attention mask shape: torch.Size([1, 1, 20432, 20432]) Position ids shape: torch.Size([1, 20432]) Input IDs shape: torch.Size([1, 20432]) Labels shape: torch.Size([1, 20432]) Final batch size: 1, sequence length: 12012 Attention mask shape: torch.Size([1, 1, 12012, 12012]) Position ids shape: torch.Size([1, 12012]) Input IDs shape: torch.Size([1, 12012]) Labels shape: torch.Size([1, 12012]) Final batch size: 1, sequence length: 22133 Attention mask shape: torch.Size([1, 1, 22133, 22133]) Position ids shape: torch.Size([1, 22133]) Input IDs shape: torch.Size([1, 22133]) Labels shape: torch.Size([1, 22133]) Final batch size: 1, sequence length: 22857 Attention mask shape: torch.Size([1, 1, 22857, 22857]) Position ids shape: torch.Size([1, 22857]) Input IDs shape: torch.Size([1, 22857]) Labels shape: torch.Size([1, 22857]) Final batch size: 1, sequence length: 16585 Attention mask shape: torch.Size([1, 1, 16585, 16585]) Position ids shape: torch.Size([1, 16585]) Input IDs shape: torch.Size([1, 16585]) Labels shape: torch.Size([1, 16585]) Final batch size: 1, sequence length: 23238 Attention mask shape: torch.Size([1, 1, 23238, 23238]) Position ids shape: torch.Size([1, 23238]) Input IDs shape: torch.Size([1, 23238]) Labels shape: torch.Size([1, 23238]) Final batch size: 1, sequence length: 15339 Attention mask shape: torch.Size([1, 1, 15339, 15339]) Position ids shape: torch.Size([1, 15339]) Input IDs shape: torch.Size([1, 15339]) Labels shape: torch.Size([1, 15339]) Final batch size: 1, sequence length: 24002 Attention mask shape: torch.Size([1, 1, 24002, 24002]) Position ids shape: torch.Size([1, 24002]) Input IDs shape: torch.Size([1, 24002]) Labels shape: torch.Size([1, 24002]) Final batch size: 1, sequence length: 20562 Attention mask shape: torch.Size([1, 1, 20562, 20562]) Position ids shape: torch.Size([1, 20562]) Input IDs shape: torch.Size([1, 20562]) Labels shape: torch.Size([1, 20562]) Final batch size: 1, sequence length: 14892 Attention mask shape: torch.Size([1, 1, 14892, 14892]) Position ids shape: torch.Size([1, 14892]) Input IDs shape: torch.Size([1, 14892]) Labels shape: torch.Size([1, 14892]) Final batch size: 1, sequence length: 24428 Attention mask shape: torch.Size([1, 1, 24428, 24428]) Position ids shape: torch.Size([1, 24428]) Input IDs shape: torch.Size([1, 24428]) Labels shape: torch.Size([1, 24428]) Final batch size: 1, sequence length: 25707 Attention mask shape: torch.Size([1, 1, 25707, 25707]) Position ids shape: torch.Size([1, 25707]) Input IDs shape: torch.Size([1, 25707]) Labels shape: torch.Size([1, 25707]) Final batch size: 1, sequence length: 14556 Attention mask shape: torch.Size([1, 1, 14556, 14556]) Position ids shape: torch.Size([1, 14556]) Input IDs shape: torch.Size([1, 14556]) Labels shape: torch.Size([1, 14556]) Final batch size: 1, sequence length: 24858 Attention mask shape: torch.Size([1, 1, 24858, 24858]) Position ids shape: torch.Size([1, 24858]) Input IDs shape: torch.Size([1, 24858]) Labels shape: torch.Size([1, 24858]) Final batch size: 1, sequence length: 24255 Attention mask shape: torch.Size([1, 1, 24255, 24255]) Position ids shape: torch.Size([1, 24255]) Input IDs shape: torch.Size([1, 24255]) Labels shape: torch.Size([1, 24255]) Final batch size: 1, sequence length: 26033 Attention mask shape: torch.Size([1, 1, 26033, 26033]) Position ids shape: torch.Size([1, 26033]) Input IDs shape: torch.Size([1, 26033]) Labels shape: torch.Size([1, 26033]) Final batch size: 1, sequence length: 24769 Attention mask shape: torch.Size([1, 1, 24769, 24769]) Position ids shape: torch.Size([1, 24769]) Input IDs shape: torch.Size([1, 24769]) Labels shape: torch.Size([1, 24769]) Final batch size: 1, sequence length: 26144 Attention mask shape: torch.Size([1, 1, 26144, 26144]) Position ids shape: torch.Size([1, 26144]) Input IDs shape: torch.Size([1, 26144]) Labels shape: torch.Size([1, 26144]) Final batch size: 1, sequence length: 21664 Attention mask shape: torch.Size([1, 1, 21664, 21664]) Position ids shape: torch.Size([1, 21664]) Input IDs shape: torch.Size([1, 21664]) Labels shape: torch.Size([1, 21664]) Final batch size: 1, sequence length: 26312 Attention mask shape: torch.Size([1, 1, 26312, 26312]) Position ids shape: torch.Size([1, 26312]) Input IDs shape: torch.Size([1, 26312]) Labels shape: torch.Size([1, 26312]) Final batch size: 1, sequence length: 16541 Attention mask shape: torch.Size([1, 1, 16541, 16541]) Position ids shape: torch.Size([1, 16541]) Input IDs shape: torch.Size([1, 16541]) Labels shape: torch.Size([1, 16541]) Final batch size: 1, sequence length: 25886 Attention mask shape: torch.Size([1, 1, 25886, 25886]) Position ids shape: torch.Size([1, 25886]) Input IDs shape: torch.Size([1, 25886]) Labels shape: torch.Size([1, 25886]) Final batch size: 1, sequence length: 25758 Attention mask shape: torch.Size([1, 1, 25758, 25758]) Position ids shape: torch.Size([1, 25758]) Input IDs shape: torch.Size([1, 25758]) Labels shape: torch.Size([1, 25758]) Final batch size: 1, sequence length: 23602 Attention mask shape: torch.Size([1, 1, 23602, 23602]) Position ids shape: torch.Size([1, 23602]) Input IDs shape: torch.Size([1, 23602]) Labels shape: torch.Size([1, 23602]) Final batch size: 1, sequence length: 30079 Attention mask shape: torch.Size([1, 1, 30079, 30079]) Position ids shape: torch.Size([1, 30079]) Input IDs shape: torch.Size([1, 30079]) Labels shape: torch.Size([1, 30079]) Final batch size: 1, sequence length: 29948 Attention mask shape: torch.Size([1, 1, 29948, 29948]) Position ids shape: torch.Size([1, 29948]) Input IDs shape: torch.Size([1, 29948]) Labels shape: torch.Size([1, 29948]) Final batch size: 1, sequence length: 10318 Attention mask shape: torch.Size([1, 1, 10318, 10318]) Position ids shape: torch.Size([1, 10318]) Input IDs shape: torch.Size([1, 10318]) Labels shape: torch.Size([1, 10318]) Final batch size: 1, sequence length: 32541 Attention mask shape: torch.Size([1, 1, 32541, 32541]) Position ids shape: torch.Size([1, 32541]) Input IDs shape: torch.Size([1, 32541]) Labels shape: torch.Size([1, 32541]) Final batch size: 1, sequence length: 19238 Attention mask shape: torch.Size([1, 1, 19238, 19238]) Position ids shape: torch.Size([1, 19238]) Input IDs shape: torch.Size([1, 19238]) Labels shape: torch.Size([1, 19238]) Final batch size: 1, sequence length: 23698 Attention mask shape: torch.Size([1, 1, 23698, 23698]) Position ids shape: torch.Size([1, 23698]) Input IDs shape: torch.Size([1, 23698]) Labels shape: torch.Size([1, 23698]) Final batch size: 1, sequence length: 29749 Attention mask shape: torch.Size([1, 1, 29749, 29749]) Position ids shape: torch.Size([1, 29749]) Input IDs shape: torch.Size([1, 29749]) Labels shape: torch.Size([1, 29749]) Final batch size: 1, sequence length: 25719 Attention mask shape: torch.Size([1, 1, 25719, 25719]) Position ids shape: torch.Size([1, 25719]) Input IDs shape: torch.Size([1, 25719]) Labels shape: torch.Size([1, 25719]) Final batch size: 1, sequence length: 29825 Attention mask shape: torch.Size([1, 1, 29825, 29825]) Position ids shape: torch.Size([1, 29825]) Input IDs shape: torch.Size([1, 29825]) Labels shape: torch.Size([1, 29825]) Final batch size: 1, sequence length: 21374 Attention mask shape: torch.Size([1, 1, 21374, 21374]) Position ids shape: torch.Size([1, 21374]) Input IDs shape: torch.Size([1, 21374]) Labels shape: torch.Size([1, 21374]) Final batch size: 1, sequence length: 32071 Attention mask shape: torch.Size([1, 1, 32071, 32071]) Position ids shape: torch.Size([1, 32071]) Input IDs shape: torch.Size([1, 32071]) Labels shape: torch.Size([1, 32071]) Final batch size: 1, sequence length: 25492 Attention mask shape: torch.Size([1, 1, 25492, 25492]) Position ids shape: torch.Size([1, 25492]) Input IDs shape: torch.Size([1, 25492]) Labels shape: torch.Size([1, 25492]) Final batch size: 1, sequence length: 15604 Attention mask shape: torch.Size([1, 1, 15604, 15604]) Position ids shape: torch.Size([1, 15604]) Input IDs shape: torch.Size([1, 15604]) Labels shape: torch.Size([1, 15604]) Final batch size: 1, sequence length: 22139 Attention mask shape: torch.Size([1, 1, 22139, 22139]) Position ids shape: torch.Size([1, 22139]) Input IDs shape: torch.Size([1, 22139]) Labels shape: torch.Size([1, 22139]) Final batch size: 1, sequence length: 25656 Attention mask shape: torch.Size([1, 1, 25656, 25656]) Position ids shape: torch.Size([1, 25656]) Input IDs shape: torch.Size([1, 25656]) Labels shape: torch.Size([1, 25656]) Final batch size: 1, sequence length: 33611 Attention mask shape: torch.Size([1, 1, 33611, 33611]) Position ids shape: torch.Size([1, 33611]) Input IDs shape: torch.Size([1, 33611]) Labels shape: torch.Size([1, 33611]) Final batch size: 1, sequence length: 34701 Attention mask shape: torch.Size([1, 1, 34701, 34701]) Position ids shape: torch.Size([1, 34701]) Input IDs shape: torch.Size([1, 34701]) Labels shape: torch.Size([1, 34701]) Final batch size: 1, sequence length: 23343 Attention mask shape: torch.Size([1, 1, 23343, 23343]) Position ids shape: torch.Size([1, 23343]) Input IDs shape: torch.Size([1, 23343]) Labels shape: torch.Size([1, 23343]) Final batch size: 1, sequence length: 36778 Attention mask shape: torch.Size([1, 1, 36778, 36778]) Position ids shape: torch.Size([1, 36778]) Input IDs shape: torch.Size([1, 36778]) Labels shape: torch.Size([1, 36778]) Final batch size: 1, sequence length: 9897 Attention mask shape: torch.Size([1, 1, 9897, 9897]) Position ids shape: torch.Size([1, 9897]) Input IDs shape: torch.Size([1, 9897]) Labels shape: torch.Size([1, 9897]) Final batch size: 1, sequence length: 34186 Attention mask shape: torch.Size([1, 1, 34186, 34186]) Position ids shape: torch.Size([1, 34186]) Input IDs shape: torch.Size([1, 34186]) Labels shape: torch.Size([1, 34186]) Final batch size: 1, sequence length: 37241 Attention mask shape: torch.Size([1, 1, 37241, 37241]) Position ids shape: torch.Size([1, 37241]) Input IDs shape: torch.Size([1, 37241]) Labels shape: torch.Size([1, 37241]) Final batch size: 1, sequence length: 27283 Attention mask shape: torch.Size([1, 1, 27283, 27283]) Position ids shape: torch.Size([1, 27283]) Input IDs shape: torch.Size([1, 27283]) Labels shape: torch.Size([1, 27283]) Final batch size: 1, sequence length: 35760 Attention mask shape: torch.Size([1, 1, 35760, 35760]) Position ids shape: torch.Size([1, 35760]) Input IDs shape: torch.Size([1, 35760]) Labels shape: torch.Size([1, 35760]) Final batch size: 1, sequence length: 23399 Attention mask shape: torch.Size([1, 1, 23399, 23399]) Position ids shape: torch.Size([1, 23399]) Input IDs shape: torch.Size([1, 23399]) Labels shape: torch.Size([1, 23399]) Final batch size: 1, sequence length: 21557 Attention mask shape: torch.Size([1, 1, 21557, 21557]) Position ids shape: torch.Size([1, 21557]) Input IDs shape: torch.Size([1, 21557]) Labels shape: torch.Size([1, 21557]) Final batch size: 1, sequence length: 37728 Attention mask shape: torch.Size([1, 1, 37728, 37728]) Position ids shape: torch.Size([1, 37728]) Input IDs shape: torch.Size([1, 37728]) Labels shape: torch.Size([1, 37728]) Final batch size: 1, sequence length: 40593 Attention mask shape: torch.Size([1, 1, 40593, 40593]) Position ids shape: torch.Size([1, 40593]) Input IDs shape: torch.Size([1, 40593]) Labels shape: torch.Size([1, 40593]) Final batch size: 1, sequence length: 36456 Attention mask shape: torch.Size([1, 1, 36456, 36456]) Position ids shape: torch.Size([1, 36456]) Input IDs shape: torch.Size([1, 36456]) Labels shape: torch.Size([1, 36456]) Final batch size: 1, sequence length: 33459 Attention mask shape: torch.Size([1, 1, 33459, 33459]) Position ids shape: torch.Size([1, 33459]) Input IDs shape: torch.Size([1, 33459]) Labels shape: torch.Size([1, 33459]) Final batch size: 1, sequence length: 31650 Attention mask shape: torch.Size([1, 1, 31650, 31650]) Position ids shape: torch.Size([1, 31650]) Input IDs shape: torch.Size([1, 31650]) Labels shape: torch.Size([1, 31650]) Final batch size: 1, sequence length: 7681 Attention mask shape: torch.Size([1, 1, 7681, 7681]) Position ids shape: torch.Size([1, 7681]) Input IDs shape: torch.Size([1, 7681]) Labels shape: torch.Size([1, 7681]) Final batch size: 1, sequence length: 39760 Attention mask shape: torch.Size([1, 1, 39760, 39760]) Position ids shape: torch.Size([1, 39760]) Input IDs shape: torch.Size([1, 39760]) Labels shape: torch.Size([1, 39760]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 31323 Attention mask shape: torch.Size([1, 1, 31323, 31323]) Position ids shape: torch.Size([1, 31323]) Input IDs shape: torch.Size([1, 31323]) Labels shape: torch.Size([1, 31323]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 21955 Attention mask shape: torch.Size([1, 1, 21955, 21955]) Position ids shape: torch.Size([1, 21955]) Input IDs shape: torch.Size([1, 21955]) Labels shape: torch.Size([1, 21955]) Final batch size: 1, sequence length: 40507 Attention mask shape: torch.Size([1, 1, 40507, 40507]) Position ids shape: torch.Size([1, 40507]) Input IDs shape: torch.Size([1, 40507]) Labels shape: torch.Size([1, 40507]) Final batch size: 1, sequence length: 40147 Attention mask shape: torch.Size([1, 1, 40147, 40147]) Position ids shape: torch.Size([1, 40147]) Input IDs shape: torch.Size([1, 40147]) Labels shape: torch.Size([1, 40147]) Final batch size: 1, sequence length: 15294 Attention mask shape: torch.Size([1, 1, 15294, 15294]) Position ids shape: torch.Size([1, 15294]) Input IDs shape: torch.Size([1, 15294]) Labels shape: torch.Size([1, 15294]) Final batch size: 1, sequence length: 9445 Attention mask shape: torch.Size([1, 1, 9445, 9445]) Position ids shape: torch.Size([1, 9445]) Input IDs shape: torch.Size([1, 9445]) Labels shape: torch.Size([1, 9445]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 29082 Attention mask shape: torch.Size([1, 1, 29082, 29082]) Position ids shape: torch.Size([1, 29082]) Input IDs shape: torch.Size([1, 29082]) Labels shape: torch.Size([1, 29082]) Final batch size: 1, sequence length: 22434 Attention mask shape: torch.Size([1, 1, 22434, 22434]) Position ids shape: torch.Size([1, 22434]) Input IDs shape: torch.Size([1, 22434]) Labels shape: torch.Size([1, 22434]) Final batch size: 1, sequence length: 26664 Attention mask shape: torch.Size([1, 1, 26664, 26664]) Position ids shape: torch.Size([1, 26664]) Input IDs shape: torch.Size([1, 26664]) Labels shape: torch.Size([1, 26664]) Final batch size: 1, sequence length: 22491 Attention mask shape: torch.Size([1, 1, 22491, 22491]) Position ids shape: torch.Size([1, 22491]) Input IDs shape: torch.Size([1, 22491]) Labels shape: torch.Size([1, 22491]) Final batch size: 1, sequence length: 37775 Attention mask shape: torch.Size([1, 1, 37775, 37775]) Position ids shape: torch.Size([1, 37775]) Input IDs shape: torch.Size([1, 37775]) Labels shape: torch.Size([1, 37775]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 37183 Attention mask shape: torch.Size([1, 1, 37183, 37183]) Position ids shape: torch.Size([1, 37183]) Input IDs shape: torch.Size([1, 37183]) Labels shape: torch.Size([1, 37183]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 19867 Attention mask shape: torch.Size([1, 1, 19867, 19867]) Position ids shape: torch.Size([1, 19867]) Input IDs shape: torch.Size([1, 19867]) Labels shape: torch.Size([1, 19867]) Final batch size: 1, sequence length: 16273 Attention mask shape: torch.Size([1, 1, 16273, 16273]) Position ids shape: torch.Size([1, 16273]) Input IDs shape: torch.Size([1, 16273]) Labels shape: torch.Size([1, 16273]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 21489 Attention mask shape: torch.Size([1, 1, 21489, 21489]) Position ids shape: torch.Size([1, 21489]) Input IDs shape: torch.Size([1, 21489]) Labels shape: torch.Size([1, 21489]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 17646 Attention mask shape: torch.Size([1, 1, 17646, 17646]) Position ids shape: torch.Size([1, 17646]) Input IDs shape: torch.Size([1, 17646]) Labels shape: torch.Size([1, 17646]) Final batch size: 1, sequence length: 36579 Attention mask shape: torch.Size([1, 1, 36579, 36579]) Position ids shape: torch.Size([1, 36579]) Input IDs shape: torch.Size([1, 36579]) Labels shape: torch.Size([1, 36579]) Final batch size: 1, sequence length: 38686 Attention mask shape: torch.Size([1, 1, 38686, 38686]) Position ids shape: torch.Size([1, 38686]) Input IDs shape: torch.Size([1, 38686]) Labels shape: torch.Size([1, 38686]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 28661 Attention mask shape: torch.Size([1, 1, 28661, 28661]) Position ids shape: torch.Size([1, 28661]) Input IDs shape: torch.Size([1, 28661]) Labels shape: torch.Size([1, 28661]) Final batch size: 1, sequence length: 37091 Attention mask shape: torch.Size([1, 1, 37091, 37091]) Position ids shape: torch.Size([1, 37091]) Input IDs shape: torch.Size([1, 37091]) Labels shape: torch.Size([1, 37091]) Final batch size: 1, sequence length: 33263 Attention mask shape: torch.Size([1, 1, 33263, 33263]) Position ids shape: torch.Size([1, 33263]) Input IDs shape: torch.Size([1, 33263]) Labels shape: torch.Size([1, 33263]) Final batch size: 1, sequence length: 23014 Attention mask shape: torch.Size([1, 1, 23014, 23014]) Position ids shape: torch.Size([1, 23014]) Input IDs shape: torch.Size([1, 23014]) Labels shape: torch.Size([1, 23014]) Final batch size: 1, sequence length: 25914 Attention mask shape: torch.Size([1, 1, 25914, 25914]) Position ids shape: torch.Size([1, 25914]) Input IDs shape: torch.Size([1, 25914]) Labels shape: torch.Size([1, 25914]) Final batch size: 1, sequence length: 30364 Attention mask shape: torch.Size([1, 1, 30364, 30364]) Position ids shape: torch.Size([1, 30364]) Input IDs shape: torch.Size([1, 30364]) Labels shape: torch.Size([1, 30364]) Final batch size: 1, sequence length: 36297 Attention mask shape: torch.Size([1, 1, 36297, 36297]) Position ids shape: torch.Size([1, 36297]) Input IDs shape: torch.Size([1, 36297]) Labels shape: torch.Size([1, 36297]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) {'loss': 0.4346, 'grad_norm': 1.8462684093012405, 'learning_rate': 7.500000000000001e-06, 'num_tokens': -inf, 'epoch': 0.5} Final batch size: 1, sequence length: 6519 Attention mask shape: torch.Size([1, 1, 6519, 6519]) Position ids shape: torch.Size([1, 6519]) Input IDs shape: torch.Size([1, 6519]) Labels shape: torch.Size([1, 6519]) Final batch size: 1, sequence length: 5525 Attention mask shape: torch.Size([1, 1, 5525, 5525]) Position ids shape: torch.Size([1, 5525]) Input IDs shape: torch.Size([1, 5525]) Labels shape: torch.Size([1, 5525]) Final batch size: 1, sequence length: 10273 Attention mask shape: torch.Size([1, 1, 10273, 10273]) Position ids shape: torch.Size([1, 10273]) Input IDs shape: torch.Size([1, 10273]) Labels shape: torch.Size([1, 10273]) Final batch size: 1, sequence length: 9181 Attention mask shape: torch.Size([1, 1, 9181, 9181]) Position ids shape: torch.Size([1, 9181]) Input IDs shape: torch.Size([1, 9181]) Labels shape: torch.Size([1, 9181]) Final batch size: 1, sequence length: 10408 Attention mask shape: torch.Size([1, 1, 10408, 10408]) Position ids shape: torch.Size([1, 10408]) Input IDs shape: torch.Size([1, 10408]) Labels shape: torch.Size([1, 10408]) Final batch size: 1, sequence length: 12281 Attention mask shape: torch.Size([1, 1, 12281, 12281]) Position ids shape: torch.Size([1, 12281]) Input IDs shape: torch.Size([1, 12281]) Labels shape: torch.Size([1, 12281]) Final batch size: 1, sequence length: 12927 Attention mask shape: torch.Size([1, 1, 12927, 12927]) Position ids shape: torch.Size([1, 12927]) Input IDs shape: torch.Size([1, 12927]) Labels shape: torch.Size([1, 12927]) Final batch size: 1, sequence length: 13363 Attention mask shape: torch.Size([1, 1, 13363, 13363]) Position ids shape: torch.Size([1, 13363]) Input IDs shape: torch.Size([1, 13363]) Labels shape: torch.Size([1, 13363]) Final batch size: 1, sequence length: 10905 Attention mask shape: torch.Size([1, 1, 10905, 10905]) Position ids shape: torch.Size([1, 10905]) Input IDs shape: torch.Size([1, 10905]) Labels shape: torch.Size([1, 10905]) Final batch size: 1, sequence length: 13804 Attention mask shape: torch.Size([1, 1, 13804, 13804]) Position ids shape: torch.Size([1, 13804]) Input IDs shape: torch.Size([1, 13804]) Labels shape: torch.Size([1, 13804]) Final batch size: 1, sequence length: 15257 Attention mask shape: torch.Size([1, 1, 15257, 15257]) Position ids shape: torch.Size([1, 15257]) Input IDs shape: torch.Size([1, 15257]) Labels shape: torch.Size([1, 15257]) Final batch size: 1, sequence length: 13385 Attention mask shape: torch.Size([1, 1, 13385, 13385]) Position ids shape: torch.Size([1, 13385]) Input IDs shape: torch.Size([1, 13385]) Labels shape: torch.Size([1, 13385]) Final batch size: 1, sequence length: 15518 Attention mask shape: torch.Size([1, 1, 15518, 15518]) Position ids shape: torch.Size([1, 15518]) Input IDs shape: torch.Size([1, 15518]) Labels shape: torch.Size([1, 15518]) Final batch size: 1, sequence length: 16520 Attention mask shape: torch.Size([1, 1, 16520, 16520]) Position ids shape: torch.Size([1, 16520]) Input IDs shape: torch.Size([1, 16520]) Labels shape: torch.Size([1, 16520]) Final batch size: 1, sequence length: 15244 Attention mask shape: torch.Size([1, 1, 15244, 15244]) Position ids shape: torch.Size([1, 15244]) Input IDs shape: torch.Size([1, 15244]) Labels shape: torch.Size([1, 15244]) Final batch size: 1, sequence length: 17003 Attention mask shape: torch.Size([1, 1, 17003, 17003]) Position ids shape: torch.Size([1, 17003]) Input IDs shape: torch.Size([1, 17003]) Labels shape: torch.Size([1, 17003]) Final batch size: 1, sequence length: 8646 Attention mask shape: torch.Size([1, 1, 8646, 8646]) Position ids shape: torch.Size([1, 8646]) Input IDs shape: torch.Size([1, 8646]) Labels shape: torch.Size([1, 8646]) Final batch size: 1, sequence length: 20979 Attention mask shape: torch.Size([1, 1, 20979, 20979]) Position ids shape: torch.Size([1, 20979]) Input IDs shape: torch.Size([1, 20979]) Labels shape: torch.Size([1, 20979]) Final batch size: 1, sequence length: 19286 Attention mask shape: torch.Size([1, 1, 19286, 19286]) Position ids shape: torch.Size([1, 19286]) Input IDs shape: torch.Size([1, 19286]) Labels shape: torch.Size([1, 19286]) Final batch size: 1, sequence length: 19768 Attention mask shape: torch.Size([1, 1, 19768, 19768]) Position ids shape: torch.Size([1, 19768]) Input IDs shape: torch.Size([1, 19768]) Labels shape: torch.Size([1, 19768]) Final batch size: 1, sequence length: 20089 Attention mask shape: torch.Size([1, 1, 20089, 20089]) Position ids shape: torch.Size([1, 20089]) Input IDs shape: torch.Size([1, 20089]) Labels shape: torch.Size([1, 20089]) Final batch size: 1, sequence length: 18645 Attention mask shape: torch.Size([1, 1, 18645, 18645]) Position ids shape: torch.Size([1, 18645]) Input IDs shape: torch.Size([1, 18645]) Labels shape: torch.Size([1, 18645]) Final batch size: 1, sequence length: 18988 Final batch size: 1, sequence length: 22777 Attention mask shape: torch.Size([1, 1, 18988, 18988]) Position ids shape: torch.Size([1, 18988]) Input IDs shape: torch.Size([1, 18988]) Labels shape: torch.Size([1, 18988]) Attention mask shape: torch.Size([1, 1, 22777, 22777]) Position ids shape: torch.Size([1, 22777]) Input IDs shape: torch.Size([1, 22777]) Labels shape: torch.Size([1, 22777]) Final batch size: 1, sequence length: 23341 Attention mask shape: torch.Size([1, 1, 23341, 23341]) Position ids shape: torch.Size([1, 23341]) Input IDs shape: torch.Size([1, 23341]) Labels shape: torch.Size([1, 23341]) Final batch size: 1, sequence length: 19513 Attention mask shape: torch.Size([1, 1, 19513, 19513]) Position ids shape: torch.Size([1, 19513]) Input IDs shape: torch.Size([1, 19513]) Labels shape: torch.Size([1, 19513]) Final batch size: 1, sequence length: 23942 Attention mask shape: torch.Size([1, 1, 23942, 23942]) Position ids shape: torch.Size([1, 23942]) Input IDs shape: torch.Size([1, 23942]) Labels shape: torch.Size([1, 23942]) Final batch size: 1, sequence length: 5405 Attention mask shape: torch.Size([1, 1, 5405, 5405]) Position ids shape: torch.Size([1, 5405]) Input IDs shape: torch.Size([1, 5405]) Labels shape: torch.Size([1, 5405]) Final batch size: 1, sequence length: 21660 Attention mask shape: torch.Size([1, 1, 21660, 21660]) Position ids shape: torch.Size([1, 21660]) Input IDs shape: torch.Size([1, 21660]) Labels shape: torch.Size([1, 21660]) Final batch size: 1, sequence length: 20106 Attention mask shape: torch.Size([1, 1, 20106, 20106]) Position ids shape: torch.Size([1, 20106]) Input IDs shape: torch.Size([1, 20106]) Labels shape: torch.Size([1, 20106]) Final batch size: 1, sequence length: 23995 Attention mask shape: torch.Size([1, 1, 23995, 23995]) Position ids shape: torch.Size([1, 23995]) Input IDs shape: torch.Size([1, 23995]) Labels shape: torch.Size([1, 23995]) Final batch size: 1, sequence length: 22915 Attention mask shape: torch.Size([1, 1, 22915, 22915]) Position ids shape: torch.Size([1, 22915]) Input IDs shape: torch.Size([1, 22915]) Labels shape: torch.Size([1, 22915]) Final batch size: 1, sequence length: 14429 Attention mask shape: torch.Size([1, 1, 14429, 14429]) Position ids shape: torch.Size([1, 14429]) Input IDs shape: torch.Size([1, 14429]) Labels shape: torch.Size([1, 14429]) Final batch size: 1, sequence length: 23886 Attention mask shape: torch.Size([1, 1, 23886, 23886]) Position ids shape: torch.Size([1, 23886]) Input IDs shape: torch.Size([1, 23886]) Labels shape: torch.Size([1, 23886]) Final batch size: 1, sequence length: 22311 Attention mask shape: torch.Size([1, 1, 22311, 22311]) Position ids shape: torch.Size([1, 22311]) Input IDs shape: torch.Size([1, 22311]) Labels shape: torch.Size([1, 22311]) Final batch size: 1, sequence length: 22979 Attention mask shape: torch.Size([1, 1, 22979, 22979]) Position ids shape: torch.Size([1, 22979]) Input IDs shape: torch.Size([1, 22979]) Labels shape: torch.Size([1, 22979]) Final batch size: 1, sequence length: 18819 Attention mask shape: torch.Size([1, 1, 18819, 18819]) Position ids shape: torch.Size([1, 18819]) Input IDs shape: torch.Size([1, 18819]) Labels shape: torch.Size([1, 18819]) Final batch size: 1, sequence length: 25001 Attention mask shape: torch.Size([1, 1, 25001, 25001]) Position ids shape: torch.Size([1, 25001]) Input IDs shape: torch.Size([1, 25001]) Labels shape: torch.Size([1, 25001]) Final batch size: 1, sequence length: 24287 Attention mask shape: torch.Size([1, 1, 24287, 24287]) Position ids shape: torch.Size([1, 24287]) Input IDs shape: torch.Size([1, 24287]) Labels shape: torch.Size([1, 24287]) Final batch size: 1, sequence length: 22160 Attention mask shape: torch.Size([1, 1, 22160, 22160]) Position ids shape: torch.Size([1, 22160]) Input IDs shape: torch.Size([1, 22160]) Labels shape: torch.Size([1, 22160]) Final batch size: 1, sequence length: 24293 Attention mask shape: torch.Size([1, 1, 24293, 24293]) Position ids shape: torch.Size([1, 24293]) Input IDs shape: torch.Size([1, 24293]) Labels shape: torch.Size([1, 24293]) Final batch size: 1, sequence length: 25021 Attention mask shape: torch.Size([1, 1, 25021, 25021]) Position ids shape: torch.Size([1, 25021]) Input IDs shape: torch.Size([1, 25021]) Labels shape: torch.Size([1, 25021]) Final batch size: 1, sequence length: 11266 Attention mask shape: torch.Size([1, 1, 11266, 11266]) Position ids shape: torch.Size([1, 11266]) Input IDs shape: torch.Size([1, 11266]) Labels shape: torch.Size([1, 11266]) Final batch size: 1, sequence length: 24407 Attention mask shape: torch.Size([1, 1, 24407, 24407]) Position ids shape: torch.Size([1, 24407]) Input IDs shape: torch.Size([1, 24407]) Labels shape: torch.Size([1, 24407]) Final batch size: 1, sequence length: 29009 Attention mask shape: torch.Size([1, 1, 29009, 29009]) Position ids shape: torch.Size([1, 29009]) Input IDs shape: torch.Size([1, 29009]) Labels shape: torch.Size([1, 29009]) Final batch size: 1, sequence length: 5288 Attention mask shape: torch.Size([1, 1, 5288, 5288]) Position ids shape: torch.Size([1, 5288]) Input IDs shape: torch.Size([1, 5288]) Labels shape: torch.Size([1, 5288]) Final batch size: 1, sequence length: 28263 Attention mask shape: torch.Size([1, 1, 28263, 28263]) Position ids shape: torch.Size([1, 28263]) Input IDs shape: torch.Size([1, 28263]) Labels shape: torch.Size([1, 28263]) Final batch size: 1, sequence length: 12483 Attention mask shape: torch.Size([1, 1, 12483, 12483]) Position ids shape: torch.Size([1, 12483]) Input IDs shape: torch.Size([1, 12483]) Labels shape: torch.Size([1, 12483]) Final batch size: 1, sequence length: 26179 Attention mask shape: torch.Size([1, 1, 26179, 26179]) Position ids shape: torch.Size([1, 26179]) Input IDs shape: torch.Size([1, 26179]) Labels shape: torch.Size([1, 26179]) Final batch size: 1, sequence length: 30236 Attention mask shape: torch.Size([1, 1, 30236, 30236]) Position ids shape: torch.Size([1, 30236]) Input IDs shape: torch.Size([1, 30236]) Labels shape: torch.Size([1, 30236]) Final batch size: 1, sequence length: 30723 Attention mask shape: torch.Size([1, 1, 30723, 30723]) Position ids shape: torch.Size([1, 30723]) Input IDs shape: torch.Size([1, 30723]) Labels shape: torch.Size([1, 30723]) Final batch size: 1, sequence length: 26054 Attention mask shape: torch.Size([1, 1, 26054, 26054]) Position ids shape: torch.Size([1, 26054]) Input IDs shape: torch.Size([1, 26054]) Labels shape: torch.Size([1, 26054]) Final batch size: 1, sequence length: 23724 Attention mask shape: torch.Size([1, 1, 23724, 23724]) Position ids shape: torch.Size([1, 23724]) Input IDs shape: torch.Size([1, 23724]) Labels shape: torch.Size([1, 23724]) Final batch size: 1, sequence length: 26461 Attention mask shape: torch.Size([1, 1, 26461, 26461]) Position ids shape: torch.Size([1, 26461]) Input IDs shape: torch.Size([1, 26461]) Labels shape: torch.Size([1, 26461]) Final batch size: 1, sequence length: 32083 Attention mask shape: torch.Size([1, 1, 32083, 32083]) Position ids shape: torch.Size([1, 32083]) Input IDs shape: torch.Size([1, 32083]) Labels shape: torch.Size([1, 32083]) Final batch size: 1, sequence length: 29237 Attention mask shape: torch.Size([1, 1, 29237, 29237]) Position ids shape: torch.Size([1, 29237]) Input IDs shape: torch.Size([1, 29237]) Labels shape: torch.Size([1, 29237]) Final batch size: 1, sequence length: 26886 Attention mask shape: torch.Size([1, 1, 26886, 26886]) Position ids shape: torch.Size([1, 26886]) Input IDs shape: torch.Size([1, 26886]) Labels shape: torch.Size([1, 26886]) Final batch size: 1, sequence length: 20941 Attention mask shape: torch.Size([1, 1, 20941, 20941]) Position ids shape: torch.Size([1, 20941]) Input IDs shape: torch.Size([1, 20941]) Labels shape: torch.Size([1, 20941]) Final batch size: 1, sequence length: 28440 Attention mask shape: torch.Size([1, 1, 28440, 28440]) Position ids shape: torch.Size([1, 28440]) Input IDs shape: torch.Size([1, 28440]) Labels shape: torch.Size([1, 28440]) Final batch size: 1, sequence length: 13600 Attention mask shape: torch.Size([1, 1, 13600, 13600]) Position ids shape: torch.Size([1, 13600]) Input IDs shape: torch.Size([1, 13600]) Labels shape: torch.Size([1, 13600]) Final batch size: 1, sequence length: 27987 Attention mask shape: torch.Size([1, 1, 27987, 27987]) Position ids shape: torch.Size([1, 27987]) Input IDs shape: torch.Size([1, 27987]) Labels shape: torch.Size([1, 27987]) Final batch size: 1, sequence length: 32660 Attention mask shape: torch.Size([1, 1, 32660, 32660]) Position ids shape: torch.Size([1, 32660]) Input IDs shape: torch.Size([1, 32660]) Labels shape: torch.Size([1, 32660]) Final batch size: 1, sequence length: 17951 Attention mask shape: torch.Size([1, 1, 17951, 17951]) Position ids shape: torch.Size([1, 17951]) Input IDs shape: torch.Size([1, 17951]) Labels shape: torch.Size([1, 17951]) Final batch size: 1, sequence length: 32799 Attention mask shape: torch.Size([1, 1, 32799, 32799]) Position ids shape: torch.Size([1, 32799]) Input IDs shape: torch.Size([1, 32799]) Labels shape: torch.Size([1, 32799]) Final batch size: 1, sequence length: 27405 Attention mask shape: torch.Size([1, 1, 27405, 27405]) Position ids shape: torch.Size([1, 27405]) Input IDs shape: torch.Size([1, 27405]) Labels shape: torch.Size([1, 27405]) Final batch size: 1, sequence length: 29880 Attention mask shape: torch.Size([1, 1, 29880, 29880]) Position ids shape: torch.Size([1, 29880]) Input IDs shape: torch.Size([1, 29880]) Labels shape: torch.Size([1, 29880]) Final batch size: 1, sequence length: 17539 Attention mask shape: torch.Size([1, 1, 17539, 17539]) Position ids shape: torch.Size([1, 17539]) Input IDs shape: torch.Size([1, 17539]) Labels shape: torch.Size([1, 17539]) Final batch size: 1, sequence length: 21472 Attention mask shape: torch.Size([1, 1, 21472, 21472]) Position ids shape: torch.Size([1, 21472]) Input IDs shape: torch.Size([1, 21472]) Labels shape: torch.Size([1, 21472]) Final batch size: 1, sequence length: 29109 Attention mask shape: torch.Size([1, 1, 29109, 29109]) Position ids shape: torch.Size([1, 29109]) Input IDs shape: torch.Size([1, 29109]) Labels shape: torch.Size([1, 29109]) Final batch size: 1, sequence length: 23945 Attention mask shape: torch.Size([1, 1, 23945, 23945]) Position ids shape: torch.Size([1, 23945]) Input IDs shape: torch.Size([1, 23945]) Labels shape: torch.Size([1, 23945]) Final batch size: 1, sequence length: 34694 Attention mask shape: torch.Size([1, 1, 34694, 34694]) Position ids shape: torch.Size([1, 34694]) Input IDs shape: torch.Size([1, 34694]) Labels shape: torch.Size([1, 34694]) Final batch size: 1, sequence length: 29561 Attention mask shape: torch.Size([1, 1, 29561, 29561]) Position ids shape: torch.Size([1, 29561]) Input IDs shape: torch.Size([1, 29561]) Labels shape: torch.Size([1, 29561]) Final batch size: 1, sequence length: 29152 Attention mask shape: torch.Size([1, 1, 29152, 29152]) Position ids shape: torch.Size([1, 29152]) Input IDs shape: torch.Size([1, 29152]) Labels shape: torch.Size([1, 29152]) Final batch size: 1, sequence length: 17649 Attention mask shape: torch.Size([1, 1, 17649, 17649]) Position ids shape: torch.Size([1, 17649]) Input IDs shape: torch.Size([1, 17649]) Labels shape: torch.Size([1, 17649]) Final batch size: 1, sequence length: 26452 Attention mask shape: torch.Size([1, 1, 26452, 26452]) Position ids shape: torch.Size([1, 26452]) Input IDs shape: torch.Size([1, 26452]) Labels shape: torch.Size([1, 26452]) Final batch size: 1, sequence length: 31512 Attention mask shape: torch.Size([1, 1, 31512, 31512]) Position ids shape: torch.Size([1, 31512]) Input IDs shape: torch.Size([1, 31512]) Labels shape: torch.Size([1, 31512]) Final batch size: 1, sequence length: 8008 Attention mask shape: torch.Size([1, 1, 8008, 8008]) Position ids shape: torch.Size([1, 8008]) Input IDs shape: torch.Size([1, 8008]) Labels shape: torch.Size([1, 8008]) Final batch size: 1, sequence length: 32287 Attention mask shape: torch.Size([1, 1, 32287, 32287]) Position ids shape: torch.Size([1, 32287]) Input IDs shape: torch.Size([1, 32287]) Labels shape: torch.Size([1, 32287]) Final batch size: 1, sequence length: 35478 Attention mask shape: torch.Size([1, 1, 35478, 35478]) Position ids shape: torch.Size([1, 35478]) Input IDs shape: torch.Size([1, 35478]) Labels shape: torch.Size([1, 35478]) Final batch size: 1, sequence length: 35999 Attention mask shape: torch.Size([1, 1, 35999, 35999]) Position ids shape: torch.Size([1, 35999]) Input IDs shape: torch.Size([1, 35999]) Labels shape: torch.Size([1, 35999]) Final batch size: 1, sequence length: 15830 Attention mask shape: torch.Size([1, 1, 15830, 15830]) Position ids shape: torch.Size([1, 15830]) Input IDs shape: torch.Size([1, 15830]) Labels shape: torch.Size([1, 15830]) Final batch size: 1, sequence length: 7448 Attention mask shape: torch.Size([1, 1, 7448, 7448]) Position ids shape: torch.Size([1, 7448]) Input IDs shape: torch.Size([1, 7448]) Labels shape: torch.Size([1, 7448]) Final batch size: 1, sequence length: 24056 Attention mask shape: torch.Size([1, 1, 24056, 24056]) Position ids shape: torch.Size([1, 24056]) Input IDs shape: torch.Size([1, 24056]) Labels shape: torch.Size([1, 24056]) Final batch size: 1, sequence length: 16631 Attention mask shape: torch.Size([1, 1, 16631, 16631]) Position ids shape: torch.Size([1, 16631]) Input IDs shape: torch.Size([1, 16631]) Labels shape: torch.Size([1, 16631]) Final batch size: 1, sequence length: 15906 Attention mask shape: torch.Size([1, 1, 15906, 15906]) Position ids shape: torch.Size([1, 15906]) Input IDs shape: torch.Size([1, 15906]) Labels shape: torch.Size([1, 15906]) Final batch size: 1, sequence length: 17775 Attention mask shape: torch.Size([1, 1, 17775, 17775]) Position ids shape: torch.Size([1, 17775]) Input IDs shape: torch.Size([1, 17775]) Labels shape: torch.Size([1, 17775]) Final batch size: 1, sequence length: 35696 Attention mask shape: torch.Size([1, 1, 35696, 35696]) Position ids shape: torch.Size([1, 35696]) Input IDs shape: torch.Size([1, 35696]) Labels shape: torch.Size([1, 35696]) Final batch size: 1, sequence length: 36314 Attention mask shape: torch.Size([1, 1, 36314, 36314]) Position ids shape: torch.Size([1, 36314]) Input IDs shape: torch.Size([1, 36314]) Labels shape: torch.Size([1, 36314]) Final batch size: 1, sequence length: 16852 Attention mask shape: torch.Size([1, 1, 16852, 16852]) Position ids shape: torch.Size([1, 16852]) Input IDs shape: torch.Size([1, 16852]) Labels shape: torch.Size([1, 16852]) Final batch size: 1, sequence length: 29262 Attention mask shape: torch.Size([1, 1, 29262, 29262]) Position ids shape: torch.Size([1, 29262]) Input IDs shape: torch.Size([1, 29262]) Labels shape: torch.Size([1, 29262]) Final batch size: 1, sequence length: 31786 Attention mask shape: torch.Size([1, 1, 31786, 31786]) Position ids shape: torch.Size([1, 31786]) Input IDs shape: torch.Size([1, 31786]) Labels shape: torch.Size([1, 31786]) Final batch size: 1, sequence length: 35526 Attention mask shape: torch.Size([1, 1, 35526, 35526]) Position ids shape: torch.Size([1, 35526]) Input IDs shape: torch.Size([1, 35526]) Labels shape: torch.Size([1, 35526]) Final batch size: 1, sequence length: 39476 Attention mask shape: torch.Size([1, 1, 39476, 39476]) Position ids shape: torch.Size([1, 39476]) Input IDs shape: torch.Size([1, 39476]) Labels shape: torch.Size([1, 39476]) Final batch size: 1, sequence length: 28691 Attention mask shape: torch.Size([1, 1, 28691, 28691]) Position ids shape: torch.Size([1, 28691]) Input IDs shape: torch.Size([1, 28691]) Labels shape: torch.Size([1, 28691]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 34777 Attention mask shape: torch.Size([1, 1, 34777, 34777]) Position ids shape: torch.Size([1, 34777]) Input IDs shape: torch.Size([1, 34777]) Labels shape: torch.Size([1, 34777]) Final batch size: 1, sequence length: 37933 Attention mask shape: torch.Size([1, 1, 37933, 37933]) Position ids shape: torch.Size([1, 37933]) Input IDs shape: torch.Size([1, 37933]) Labels shape: torch.Size([1, 37933]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 10480 Attention mask shape: torch.Size([1, 1, 10480, 10480]) Position ids shape: torch.Size([1, 10480]) Input IDs shape: torch.Size([1, 10480]) Labels shape: torch.Size([1, 10480]) Final batch size: 1, sequence length: 11395 Attention mask shape: torch.Size([1, 1, 11395, 11395]) Position ids shape: torch.Size([1, 11395]) Input IDs shape: torch.Size([1, 11395]) Labels shape: torch.Size([1, 11395]) Final batch size: 1, sequence length: 28879 Attention mask shape: torch.Size([1, 1, 28879, 28879]) Position ids shape: torch.Size([1, 28879]) Input IDs shape: torch.Size([1, 28879]) Labels shape: torch.Size([1, 28879]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 18911 Attention mask shape: torch.Size([1, 1, 18911, 18911]) Position ids shape: torch.Size([1, 18911]) Input IDs shape: torch.Size([1, 18911]) Labels shape: torch.Size([1, 18911]) Final batch size: 1, sequence length: 28495 Attention mask shape: torch.Size([1, 1, 28495, 28495]) Position ids shape: torch.Size([1, 28495]) Input IDs shape: torch.Size([1, 28495]) Labels shape: torch.Size([1, 28495]) Final batch size: 1, sequence length: 24824 Attention mask shape: torch.Size([1, 1, 24824, 24824]) Position ids shape: torch.Size([1, 24824]) Input IDs shape: torch.Size([1, 24824]) Labels shape: torch.Size([1, 24824]) Final batch size: 1, sequence length: 20176 Attention mask shape: torch.Size([1, 1, 20176, 20176]) Position ids shape: torch.Size([1, 20176]) Input IDs shape: torch.Size([1, 20176]) Labels shape: torch.Size([1, 20176]) Final batch size: 1, sequence length: 20694 Attention mask shape: torch.Size([1, 1, 20694, 20694]) Position ids shape: torch.Size([1, 20694]) Input IDs shape: torch.Size([1, 20694]) Labels shape: torch.Size([1, 20694]) Final batch size: 1, sequence length: 31132 Attention mask shape: torch.Size([1, 1, 31132, 31132]) Position ids shape: torch.Size([1, 31132]) Input IDs shape: torch.Size([1, 31132]) Labels shape: torch.Size([1, 31132]) Final batch size: 1, sequence length: 38935 Attention mask shape: torch.Size([1, 1, 38935, 38935]) Position ids shape: torch.Size([1, 38935]) Input IDs shape: torch.Size([1, 38935]) Labels shape: torch.Size([1, 38935]) Final batch size: 1, sequence length: 18127 Attention mask shape: torch.Size([1, 1, 18127, 18127]) Position ids shape: torch.Size([1, 18127]) Input IDs shape: torch.Size([1, 18127]) Labels shape: torch.Size([1, 18127]) Final batch size: 1, sequence length: 6378 Attention mask shape: torch.Size([1, 1, 6378, 6378]) Position ids shape: torch.Size([1, 6378]) Input IDs shape: torch.Size([1, 6378]) Labels shape: torch.Size([1, 6378]) Final batch size: 1, sequence length: 14869 Attention mask shape: torch.Size([1, 1, 14869, 14869]) Position ids shape: torch.Size([1, 14869]) Input IDs shape: torch.Size([1, 14869]) Labels shape: torch.Size([1, 14869]) Final batch size: 1, sequence length: 30024 Attention mask shape: torch.Size([1, 1, 30024, 30024]) Position ids shape: torch.Size([1, 30024]) Input IDs shape: torch.Size([1, 30024]) Labels shape: torch.Size([1, 30024]) Final batch size: 1, sequence length: 39687 Attention mask shape: torch.Size([1, 1, 39687, 39687]) Position ids shape: torch.Size([1, 39687]) Input IDs shape: torch.Size([1, 39687]) Labels shape: torch.Size([1, 39687]) Final batch size: 1, sequence length: 37159 Attention mask shape: torch.Size([1, 1, 37159, 37159]) Position ids shape: torch.Size([1, 37159]) Input IDs shape: torch.Size([1, 37159]) Labels shape: torch.Size([1, 37159]) Final batch size: 1, sequence length: 39324 Attention mask shape: torch.Size([1, 1, 39324, 39324]) Position ids shape: torch.Size([1, 39324]) Input IDs shape: torch.Size([1, 39324]) Labels shape: torch.Size([1, 39324]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40605 Attention mask shape: torch.Size([1, 1, 40605, 40605]) Position ids shape: torch.Size([1, 40605]) Input IDs shape: torch.Size([1, 40605]) Labels shape: torch.Size([1, 40605]) Final batch size: 1, sequence length: 28313 Attention mask shape: torch.Size([1, 1, 28313, 28313]) Position ids shape: torch.Size([1, 28313]) Input IDs shape: torch.Size([1, 28313]) Labels shape: torch.Size([1, 28313]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 35116 Attention mask shape: torch.Size([1, 1, 35116, 35116]) Position ids shape: torch.Size([1, 35116]) Input IDs shape: torch.Size([1, 35116]) Labels shape: torch.Size([1, 35116]) Final batch size: 1, sequence length: 36469 Attention mask shape: torch.Size([1, 1, 36469, 36469]) Position ids shape: torch.Size([1, 36469]) Input IDs shape: torch.Size([1, 36469]) Labels shape: torch.Size([1, 36469]) Final batch size: 1, sequence length: 30245 Attention mask shape: torch.Size([1, 1, 30245, 30245]) Position ids shape: torch.Size([1, 30245]) Input IDs shape: torch.Size([1, 30245]) Labels shape: torch.Size([1, 30245]) Final batch size: 1, sequence length: 33070 Attention mask shape: torch.Size([1, 1, 33070, 33070]) Position ids shape: torch.Size([1, 33070]) Input IDs shape: torch.Size([1, 33070]) Labels shape: torch.Size([1, 33070]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 26402 Attention mask shape: torch.Size([1, 1, 26402, 26402]) Position ids shape: torch.Size([1, 26402]) Input IDs shape: torch.Size([1, 26402]) Labels shape: torch.Size([1, 26402]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) {'loss': 0.385, 'grad_norm': 1.101009209841372, 'learning_rate': 1e-05, 'num_tokens': -inf, 'epoch': 0.62} Final batch size: 1, sequence length: 3968 Attention mask shape: torch.Size([1, 1, 3968, 3968]) Position ids shape: torch.Size([1, 3968]) Input IDs shape: torch.Size([1, 3968]) Labels shape: torch.Size([1, 3968]) Final batch size: 1, sequence length: 6567 Attention mask shape: torch.Size([1, 1, 6567, 6567]) Position ids shape: torch.Size([1, 6567]) Input IDs shape: torch.Size([1, 6567]) Labels shape: torch.Size([1, 6567]) Final batch size: 1, sequence length: 8347 Attention mask shape: torch.Size([1, 1, 8347, 8347]) Position ids shape: torch.Size([1, 8347]) Input IDs shape: torch.Size([1, 8347]) Labels shape: torch.Size([1, 8347]) Final batch size: 1, sequence length: 5997 Attention mask shape: torch.Size([1, 1, 5997, 5997]) Position ids shape: torch.Size([1, 5997]) Input IDs shape: torch.Size([1, 5997]) Labels shape: torch.Size([1, 5997]) Final batch size: 1, sequence length: 9027 Attention mask shape: torch.Size([1, 1, 9027, 9027]) Position ids shape: torch.Size([1, 9027]) Input IDs shape: torch.Size([1, 9027]) Labels shape: torch.Size([1, 9027]) Final batch size: 1, sequence length: 11752 Attention mask shape: torch.Size([1, 1, 11752, 11752]) Position ids shape: torch.Size([1, 11752]) Input IDs shape: torch.Size([1, 11752]) Labels shape: torch.Size([1, 11752]) Final batch size: 1, sequence length: 11232 Attention mask shape: torch.Size([1, 1, 11232, 11232]) Position ids shape: torch.Size([1, 11232]) Input IDs shape: torch.Size([1, 11232]) Labels shape: torch.Size([1, 11232]) Final batch size: 1, sequence length: 11054 Attention mask shape: torch.Size([1, 1, 11054, 11054]) Position ids shape: torch.Size([1, 11054]) Input IDs shape: torch.Size([1, 11054]) Labels shape: torch.Size([1, 11054]) Final batch size: 1, sequence length: 12967 Attention mask shape: torch.Size([1, 1, 12967, 12967]) Position ids shape: torch.Size([1, 12967]) Input IDs shape: torch.Size([1, 12967]) Labels shape: torch.Size([1, 12967]) Final batch size: 1, sequence length: 13260 Attention mask shape: torch.Size([1, 1, 13260, 13260]) Position ids shape: torch.Size([1, 13260]) Input IDs shape: torch.Size([1, 13260]) Labels shape: torch.Size([1, 13260]) Final batch size: 1, sequence length: 11880 Attention mask shape: torch.Size([1, 1, 11880, 11880]) Position ids shape: torch.Size([1, 11880]) Input IDs shape: torch.Size([1, 11880]) Labels shape: torch.Size([1, 11880]) Final batch size: 1, sequence length: 14512 Attention mask shape: torch.Size([1, 1, 14512, 14512]) Position ids shape: torch.Size([1, 14512]) Input IDs shape: torch.Size([1, 14512]) Labels shape: torch.Size([1, 14512]) Final batch size: 1, sequence length: 16014 Attention mask shape: torch.Size([1, 1, 16014, 16014]) Position ids shape: torch.Size([1, 16014]) Input IDs shape: torch.Size([1, 16014]) Labels shape: torch.Size([1, 16014]) Final batch size: 1, sequence length: 14758 Attention mask shape: torch.Size([1, 1, 14758, 14758]) Position ids shape: torch.Size([1, 14758]) Input IDs shape: torch.Size([1, 14758]) Labels shape: torch.Size([1, 14758]) Final batch size: 1, sequence length: 14597 Attention mask shape: torch.Size([1, 1, 14597, 14597]) Position ids shape: torch.Size([1, 14597]) Input IDs shape: torch.Size([1, 14597]) Labels shape: torch.Size([1, 14597]) Final batch size: 1, sequence length: 17166 Attention mask shape: torch.Size([1, 1, 17166, 17166]) Position ids shape: torch.Size([1, 17166]) Input IDs shape: torch.Size([1, 17166]) Labels shape: torch.Size([1, 17166]) Final batch size: 1, sequence length: 12370 Attention mask shape: torch.Size([1, 1, 12370, 12370]) Position ids shape: torch.Size([1, 12370]) Input IDs shape: torch.Size([1, 12370]) Labels shape: torch.Size([1, 12370]) Final batch size: 1, sequence length: 15438 Attention mask shape: torch.Size([1, 1, 15438, 15438]) Position ids shape: torch.Size([1, 15438]) Input IDs shape: torch.Size([1, 15438]) Labels shape: torch.Size([1, 15438]) Final batch size: 1, sequence length: 17092 Attention mask shape: torch.Size([1, 1, 17092, 17092]) Position ids shape: torch.Size([1, 17092]) Input IDs shape: torch.Size([1, 17092]) Labels shape: torch.Size([1, 17092]) Final batch size: 1, sequence length: 18050 Attention mask shape: torch.Size([1, 1, 18050, 18050]) Position ids shape: torch.Size([1, 18050]) Input IDs shape: torch.Size([1, 18050]) Labels shape: torch.Size([1, 18050]) Final batch size: 1, sequence length: 17839 Attention mask shape: torch.Size([1, 1, 17839, 17839]) Position ids shape: torch.Size([1, 17839]) Input IDs shape: torch.Size([1, 17839]) Labels shape: torch.Size([1, 17839]) Final batch size: 1, sequence length: 18264 Attention mask shape: torch.Size([1, 1, 18264, 18264]) Position ids shape: torch.Size([1, 18264]) Input IDs shape: torch.Size([1, 18264]) Labels shape: torch.Size([1, 18264]) Final batch size: 1, sequence length: 14520 Attention mask shape: torch.Size([1, 1, 14520, 14520]) Position ids shape: torch.Size([1, 14520]) Input IDs shape: torch.Size([1, 14520]) Labels shape: torch.Size([1, 14520]) Final batch size: 1, sequence length: 11206 Attention mask shape: torch.Size([1, 1, 11206, 11206]) Position ids shape: torch.Size([1, 11206]) Input IDs shape: torch.Size([1, 11206]) Labels shape: torch.Size([1, 11206]) Final batch size: 1, sequence length: 20487 Attention mask shape: torch.Size([1, 1, 20487, 20487]) Position ids shape: torch.Size([1, 20487]) Input IDs shape: torch.Size([1, 20487]) Labels shape: torch.Size([1, 20487]) Final batch size: 1, sequence length: 17194 Attention mask shape: torch.Size([1, 1, 17194, 17194]) Position ids shape: torch.Size([1, 17194]) Input IDs shape: torch.Size([1, 17194]) Labels shape: torch.Size([1, 17194]) Final batch size: 1, sequence length: 20947 Attention mask shape: torch.Size([1, 1, 20947, 20947]) Position ids shape: torch.Size([1, 20947]) Input IDs shape: torch.Size([1, 20947]) Labels shape: torch.Size([1, 20947]) Final batch size: 1, sequence length: 18131 Attention mask shape: torch.Size([1, 1, 18131, 18131]) Position ids shape: torch.Size([1, 18131]) Input IDs shape: torch.Size([1, 18131]) Labels shape: torch.Size([1, 18131]) Final batch size: 1, sequence length: 13061 Attention mask shape: torch.Size([1, 1, 13061, 13061]) Position ids shape: torch.Size([1, 13061]) Input IDs shape: torch.Size([1, 13061]) Labels shape: torch.Size([1, 13061]) Final batch size: 1, sequence length: 14335 Attention mask shape: torch.Size([1, 1, 14335, 14335]) Position ids shape: torch.Size([1, 14335]) Input IDs shape: torch.Size([1, 14335]) Labels shape: torch.Size([1, 14335]) Final batch size: 1, sequence length: 23324 Attention mask shape: torch.Size([1, 1, 23324, 23324]) Position ids shape: torch.Size([1, 23324]) Input IDs shape: torch.Size([1, 23324]) Labels shape: torch.Size([1, 23324]) Final batch size: 1, sequence length: 22547 Attention mask shape: torch.Size([1, 1, 22547, 22547]) Position ids shape: torch.Size([1, 22547]) Input IDs shape: torch.Size([1, 22547]) Labels shape: torch.Size([1, 22547]) Final batch size: 1, sequence length: 17115 Attention mask shape: torch.Size([1, 1, 17115, 17115]) Position ids shape: torch.Size([1, 17115]) Input IDs shape: torch.Size([1, 17115]) Labels shape: torch.Size([1, 17115]) Final batch size: 1, sequence length: 18950 Attention mask shape: torch.Size([1, 1, 18950, 18950]) Position ids shape: torch.Size([1, 18950]) Input IDs shape: torch.Size([1, 18950]) Labels shape: torch.Size([1, 18950]) Final batch size: 1, sequence length: 24566 Attention mask shape: torch.Size([1, 1, 24566, 24566]) Position ids shape: torch.Size([1, 24566]) Input IDs shape: torch.Size([1, 24566]) Labels shape: torch.Size([1, 24566]) Final batch size: 1, sequence length: 21137 Attention mask shape: torch.Size([1, 1, 21137, 21137]) Position ids shape: torch.Size([1, 21137]) Input IDs shape: torch.Size([1, 21137]) Labels shape: torch.Size([1, 21137]) Final batch size: 1, sequence length: 26068 Attention mask shape: torch.Size([1, 1, 26068, 26068]) Position ids shape: torch.Size([1, 26068]) Input IDs shape: torch.Size([1, 26068]) Labels shape: torch.Size([1, 26068]) Final batch size: 1, sequence length: 22107 Attention mask shape: torch.Size([1, 1, 22107, 22107]) Position ids shape: torch.Size([1, 22107]) Input IDs shape: torch.Size([1, 22107]) Labels shape: torch.Size([1, 22107]) Final batch size: 1, sequence length: 24432 Attention mask shape: torch.Size([1, 1, 24432, 24432]) Position ids shape: torch.Size([1, 24432]) Input IDs shape: torch.Size([1, 24432]) Labels shape: torch.Size([1, 24432]) Final batch size: 1, sequence length: 26271 Attention mask shape: torch.Size([1, 1, 26271, 26271]) Position ids shape: torch.Size([1, 26271]) Input IDs shape: torch.Size([1, 26271]) Labels shape: torch.Size([1, 26271]) Final batch size: 1, sequence length: 15229 Attention mask shape: torch.Size([1, 1, 15229, 15229]) Position ids shape: torch.Size([1, 15229]) Input IDs shape: torch.Size([1, 15229]) Labels shape: torch.Size([1, 15229]) Final batch size: 1, sequence length: 23960 Attention mask shape: torch.Size([1, 1, 23960, 23960]) Position ids shape: torch.Size([1, 23960]) Input IDs shape: torch.Size([1, 23960]) Labels shape: torch.Size([1, 23960]) Final batch size: 1, sequence length: 26937 Attention mask shape: torch.Size([1, 1, 26937, 26937]) Position ids shape: torch.Size([1, 26937]) Input IDs shape: torch.Size([1, 26937]) Labels shape: torch.Size([1, 26937]) Final batch size: 1, sequence length: 28284 Attention mask shape: torch.Size([1, 1, 28284, 28284]) Position ids shape: torch.Size([1, 28284]) Input IDs shape: torch.Size([1, 28284]) Labels shape: torch.Size([1, 28284]) Final batch size: 1, sequence length: 29236 Attention mask shape: torch.Size([1, 1, 29236, 29236]) Position ids shape: torch.Size([1, 29236]) Input IDs shape: torch.Size([1, 29236]) Labels shape: torch.Size([1, 29236]) Final batch size: 1, sequence length: 28858 Attention mask shape: torch.Size([1, 1, 28858, 28858]) Position ids shape: torch.Size([1, 28858]) Input IDs shape: torch.Size([1, 28858]) Labels shape: torch.Size([1, 28858]) Final batch size: 1, sequence length: 26333 Attention mask shape: torch.Size([1, 1, 26333, 26333]) Position ids shape: torch.Size([1, 26333]) Input IDs shape: torch.Size([1, 26333]) Labels shape: torch.Size([1, 26333]) Final batch size: 1, sequence length: 25252 Attention mask shape: torch.Size([1, 1, 25252, 25252]) Position ids shape: torch.Size([1, 25252]) Input IDs shape: torch.Size([1, 25252]) Labels shape: torch.Size([1, 25252]) Final batch size: 1, sequence length: 29768 Attention mask shape: torch.Size([1, 1, 29768, 29768]) Position ids shape: torch.Size([1, 29768]) Input IDs shape: torch.Size([1, 29768]) Labels shape: torch.Size([1, 29768]) Final batch size: 1, sequence length: 29478 Attention mask shape: torch.Size([1, 1, 29478, 29478]) Position ids shape: torch.Size([1, 29478]) Input IDs shape: torch.Size([1, 29478]) Labels shape: torch.Size([1, 29478]) Final batch size: 1, sequence length: 10286 Attention mask shape: torch.Size([1, 1, 10286, 10286]) Position ids shape: torch.Size([1, 10286]) Input IDs shape: torch.Size([1, 10286]) Labels shape: torch.Size([1, 10286]) Final batch size: 1, sequence length: 27293 Attention mask shape: torch.Size([1, 1, 27293, 27293]) Position ids shape: torch.Size([1, 27293]) Input IDs shape: torch.Size([1, 27293]) Labels shape: torch.Size([1, 27293]) Final batch size: 1, sequence length: 21728 Attention mask shape: torch.Size([1, 1, 21728, 21728]) Position ids shape: torch.Size([1, 21728]) Input IDs shape: torch.Size([1, 21728]) Labels shape: torch.Size([1, 21728]) Final batch size: 1, sequence length: 31740 Attention mask shape: torch.Size([1, 1, 31740, 31740]) Position ids shape: torch.Size([1, 31740]) Input IDs shape: torch.Size([1, 31740]) Labels shape: torch.Size([1, 31740]) Final batch size: 1, sequence length: 30862 Attention mask shape: torch.Size([1, 1, 30862, 30862]) Position ids shape: torch.Size([1, 30862]) Input IDs shape: torch.Size([1, 30862]) Labels shape: torch.Size([1, 30862]) Final batch size: 1, sequence length: 28421 Attention mask shape: torch.Size([1, 1, 28421, 28421]) Position ids shape: torch.Size([1, 28421]) Input IDs shape: torch.Size([1, 28421]) Labels shape: torch.Size([1, 28421]) Final batch size: 1, sequence length: 18051 Attention mask shape: torch.Size([1, 1, 18051, 18051]) Position ids shape: torch.Size([1, 18051]) Input IDs shape: torch.Size([1, 18051]) Labels shape: torch.Size([1, 18051]) Final batch size: 1, sequence length: 32392 Attention mask shape: torch.Size([1, 1, 32392, 32392]) Position ids shape: torch.Size([1, 32392]) Input IDs shape: torch.Size([1, 32392]) Labels shape: torch.Size([1, 32392]) Final batch size: 1, sequence length: 31377 Attention mask shape: torch.Size([1, 1, 31377, 31377]) Position ids shape: torch.Size([1, 31377]) Input IDs shape: torch.Size([1, 31377]) Labels shape: torch.Size([1, 31377]) Final batch size: 1, sequence length: 27278 Attention mask shape: torch.Size([1, 1, 27278, 27278]) Position ids shape: torch.Size([1, 27278]) Input IDs shape: torch.Size([1, 27278]) Labels shape: torch.Size([1, 27278]) Final batch size: 1, sequence length: 7364 Attention mask shape: torch.Size([1, 1, 7364, 7364]) Position ids shape: torch.Size([1, 7364]) Input IDs shape: torch.Size([1, 7364]) Labels shape: torch.Size([1, 7364]) Final batch size: 1, sequence length: 29592 Attention mask shape: torch.Size([1, 1, 29592, 29592]) Position ids shape: torch.Size([1, 29592]) Input IDs shape: torch.Size([1, 29592]) Labels shape: torch.Size([1, 29592]) Final batch size: 1, sequence length: 13257 Attention mask shape: torch.Size([1, 1, 13257, 13257]) Position ids shape: torch.Size([1, 13257]) Input IDs shape: torch.Size([1, 13257]) Labels shape: torch.Size([1, 13257]) Final batch size: 1, sequence length: 31860 Attention mask shape: torch.Size([1, 1, 31860, 31860]) Position ids shape: torch.Size([1, 31860]) Input IDs shape: torch.Size([1, 31860]) Labels shape: torch.Size([1, 31860]) Final batch size: 1, sequence length: 18408 Attention mask shape: torch.Size([1, 1, 18408, 18408]) Position ids shape: torch.Size([1, 18408]) Input IDs shape: torch.Size([1, 18408]) Labels shape: torch.Size([1, 18408]) Final batch size: 1, sequence length: 29489 Attention mask shape: torch.Size([1, 1, 29489, 29489]) Position ids shape: torch.Size([1, 29489]) Input IDs shape: torch.Size([1, 29489]) Labels shape: torch.Size([1, 29489]) Final batch size: 1, sequence length: 33097 Attention mask shape: torch.Size([1, 1, 33097, 33097]) Position ids shape: torch.Size([1, 33097]) Input IDs shape: torch.Size([1, 33097]) Labels shape: torch.Size([1, 33097]) Final batch size: 1, sequence length: 9974 Attention mask shape: torch.Size([1, 1, 9974, 9974]) Position ids shape: torch.Size([1, 9974]) Input IDs shape: torch.Size([1, 9974]) Labels shape: torch.Size([1, 9974]) Final batch size: 1, sequence length: 21500 Attention mask shape: torch.Size([1, 1, 21500, 21500]) Position ids shape: torch.Size([1, 21500]) Input IDs shape: torch.Size([1, 21500]) Labels shape: torch.Size([1, 21500]) Final batch size: 1, sequence length: 35171 Attention mask shape: torch.Size([1, 1, 35171, 35171]) Position ids shape: torch.Size([1, 35171]) Input IDs shape: torch.Size([1, 35171]) Labels shape: torch.Size([1, 35171]) Final batch size: 1, sequence length: 32181 Attention mask shape: torch.Size([1, 1, 32181, 32181]) Position ids shape: torch.Size([1, 32181]) Input IDs shape: torch.Size([1, 32181]) Labels shape: torch.Size([1, 32181]) Final batch size: 1, sequence length: 32112 Attention mask shape: torch.Size([1, 1, 32112, 32112]) Position ids shape: torch.Size([1, 32112]) Input IDs shape: torch.Size([1, 32112]) Labels shape: torch.Size([1, 32112]) Final batch size: 1, sequence length: 19888 Attention mask shape: torch.Size([1, 1, 19888, 19888]) Position ids shape: torch.Size([1, 19888]) Input IDs shape: torch.Size([1, 19888]) Labels shape: torch.Size([1, 19888]) Final batch size: 1, sequence length: 33736 Attention mask shape: torch.Size([1, 1, 33736, 33736]) Position ids shape: torch.Size([1, 33736]) Input IDs shape: torch.Size([1, 33736]) Labels shape: torch.Size([1, 33736]) Final batch size: 1, sequence length: 35069 Attention mask shape: torch.Size([1, 1, 35069, 35069]) Position ids shape: torch.Size([1, 35069]) Input IDs shape: torch.Size([1, 35069]) Labels shape: torch.Size([1, 35069]) Final batch size: 1, sequence length: 37623 Attention mask shape: torch.Size([1, 1, 37623, 37623]) Position ids shape: torch.Size([1, 37623]) Input IDs shape: torch.Size([1, 37623]) Labels shape: torch.Size([1, 37623]) Final batch size: 1, sequence length: 34514 Attention mask shape: torch.Size([1, 1, 34514, 34514]) Position ids shape: torch.Size([1, 34514]) Input IDs shape: torch.Size([1, 34514]) Labels shape: torch.Size([1, 34514]) Final batch size: 1, sequence length: 30051 Attention mask shape: torch.Size([1, 1, 30051, 30051]) Position ids shape: torch.Size([1, 30051]) Input IDs shape: torch.Size([1, 30051]) Labels shape: torch.Size([1, 30051]) Final batch size: 1, sequence length: 32162 Attention mask shape: torch.Size([1, 1, 32162, 32162]) Position ids shape: torch.Size([1, 32162]) Input IDs shape: torch.Size([1, 32162]) Labels shape: torch.Size([1, 32162]) Final batch size: 1, sequence length: 38351 Attention mask shape: torch.Size([1, 1, 38351, 38351]) Position ids shape: torch.Size([1, 38351]) Input IDs shape: torch.Size([1, 38351]) Labels shape: torch.Size([1, 38351]) Final batch size: 1, sequence length: 35596 Attention mask shape: torch.Size([1, 1, 35596, 35596]) Position ids shape: torch.Size([1, 35596]) Input IDs shape: torch.Size([1, 35596]) Labels shape: torch.Size([1, 35596]) Final batch size: 1, sequence length: 38790 Attention mask shape: torch.Size([1, 1, 38790, 38790]) Position ids shape: torch.Size([1, 38790]) Input IDs shape: torch.Size([1, 38790]) Labels shape: torch.Size([1, 38790]) Final batch size: 1, sequence length: 25484 Attention mask shape: torch.Size([1, 1, 25484, 25484]) Position ids shape: torch.Size([1, 25484]) Input IDs shape: torch.Size([1, 25484]) Labels shape: torch.Size([1, 25484]) Final batch size: 1, sequence length: 13557 Attention mask shape: torch.Size([1, 1, 13557, 13557]) Position ids shape: torch.Size([1, 13557]) Input IDs shape: torch.Size([1, 13557]) Labels shape: torch.Size([1, 13557]) Final batch size: 1, sequence length: 25386 Attention mask shape: torch.Size([1, 1, 25386, 25386]) Position ids shape: torch.Size([1, 25386]) Input IDs shape: torch.Size([1, 25386]) Labels shape: torch.Size([1, 25386]) Final batch size: 1, sequence length: 18711 Attention mask shape: torch.Size([1, 1, 18711, 18711]) Position ids shape: torch.Size([1, 18711]) Input IDs shape: torch.Size([1, 18711]) Labels shape: torch.Size([1, 18711]) Final batch size: 1, sequence length: 34871 Attention mask shape: torch.Size([1, 1, 34871, 34871]) Position ids shape: torch.Size([1, 34871]) Input IDs shape: torch.Size([1, 34871]) Labels shape: torch.Size([1, 34871]) Final batch size: 1, sequence length: 17777 Attention mask shape: torch.Size([1, 1, 17777, 17777]) Position ids shape: torch.Size([1, 17777]) Input IDs shape: torch.Size([1, 17777]) Labels shape: torch.Size([1, 17777]) Final batch size: 1, sequence length: 32979 Attention mask shape: torch.Size([1, 1, 32979, 32979]) Position ids shape: torch.Size([1, 32979]) Input IDs shape: torch.Size([1, 32979]) Labels shape: torch.Size([1, 32979]) Final batch size: 1, sequence length: 21408 Attention mask shape: torch.Size([1, 1, 21408, 21408]) Position ids shape: torch.Size([1, 21408]) Input IDs shape: torch.Size([1, 21408]) Labels shape: torch.Size([1, 21408]) Final batch size: 1, sequence length: 30302 Attention mask shape: torch.Size([1, 1, 30302, 30302]) Position ids shape: torch.Size([1, 30302]) Input IDs shape: torch.Size([1, 30302]) Labels shape: torch.Size([1, 30302]) Final batch size: 1, sequence length: 24061 Attention mask shape: torch.Size([1, 1, 24061, 24061]) Position ids shape: torch.Size([1, 24061]) Input IDs shape: torch.Size([1, 24061]) Labels shape: torch.Size([1, 24061]) Final batch size: 1, sequence length: 31679 Attention mask shape: torch.Size([1, 1, 31679, 31679]) Position ids shape: torch.Size([1, 31679]) Input IDs shape: torch.Size([1, 31679]) Labels shape: torch.Size([1, 31679]) Final batch size: 1, sequence length: 30639 Attention mask shape: torch.Size([1, 1, 30639, 30639]) Position ids shape: torch.Size([1, 30639]) Input IDs shape: torch.Size([1, 30639]) Labels shape: torch.Size([1, 30639]) Final batch size: 1, sequence length: 24943 Attention mask shape: torch.Size([1, 1, 24943, 24943]) Position ids shape: torch.Size([1, 24943]) Input IDs shape: torch.Size([1, 24943]) Labels shape: torch.Size([1, 24943]) Final batch size: 1, sequence length: 39754 Attention mask shape: torch.Size([1, 1, 39754, 39754]) Position ids shape: torch.Size([1, 39754]) Input IDs shape: torch.Size([1, 39754]) Labels shape: torch.Size([1, 39754]) Final batch size: 1, sequence length: 31513 Attention mask shape: torch.Size([1, 1, 31513, 31513]) Position ids shape: torch.Size([1, 31513]) Input IDs shape: torch.Size([1, 31513]) Labels shape: torch.Size([1, 31513]) Final batch size: 1, sequence length: 34170 Attention mask shape: torch.Size([1, 1, 34170, 34170]) Position ids shape: torch.Size([1, 34170]) Input IDs shape: torch.Size([1, 34170]) Labels shape: torch.Size([1, 34170]) Final batch size: 1, sequence length: 32327 Attention mask shape: torch.Size([1, 1, 32327, 32327]) Position ids shape: torch.Size([1, 32327]) Input IDs shape: torch.Size([1, 32327]) Labels shape: torch.Size([1, 32327]) Final batch size: 1, sequence length: 36535 Attention mask shape: torch.Size([1, 1, 36535, 36535]) Position ids shape: torch.Size([1, 36535]) Input IDs shape: torch.Size([1, 36535]) Labels shape: torch.Size([1, 36535]) Final batch size: 1, sequence length: 23243 Attention mask shape: torch.Size([1, 1, 23243, 23243]) Position ids shape: torch.Size([1, 23243]) Input IDs shape: torch.Size([1, 23243]) Labels shape: torch.Size([1, 23243]) Final batch size: 1, sequence length: 18106 Attention mask shape: torch.Size([1, 1, 18106, 18106]) Position ids shape: torch.Size([1, 18106]) Input IDs shape: torch.Size([1, 18106]) Labels shape: torch.Size([1, 18106]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 33807 Attention mask shape: torch.Size([1, 1, 33807, 33807]) Position ids shape: torch.Size([1, 33807]) Input IDs shape: torch.Size([1, 33807]) Labels shape: torch.Size([1, 33807]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 21825 Attention mask shape: torch.Size([1, 1, 21825, 21825]) Position ids shape: torch.Size([1, 21825]) Input IDs shape: torch.Size([1, 21825]) Labels shape: torch.Size([1, 21825]) Final batch size: 1, sequence length: 24840 Attention mask shape: torch.Size([1, 1, 24840, 24840]) Position ids shape: torch.Size([1, 24840]) Input IDs shape: torch.Size([1, 24840]) Labels shape: torch.Size([1, 24840]) Final batch size: 1, sequence length: 39483 Attention mask shape: torch.Size([1, 1, 39483, 39483]) Position ids shape: torch.Size([1, 39483]) Input IDs shape: torch.Size([1, 39483]) Labels shape: torch.Size([1, 39483]) Final batch size: 1, sequence length: 15697 Attention mask shape: torch.Size([1, 1, 15697, 15697]) Position ids shape: torch.Size([1, 15697]) Input IDs shape: torch.Size([1, 15697]) Labels shape: torch.Size([1, 15697]) Final batch size: 1, sequence length: 24910 Attention mask shape: torch.Size([1, 1, 24910, 24910]) Position ids shape: torch.Size([1, 24910]) Input IDs shape: torch.Size([1, 24910]) Labels shape: torch.Size([1, 24910]) Final batch size: 1, sequence length: 33883 Attention mask shape: torch.Size([1, 1, 33883, 33883]) Position ids shape: torch.Size([1, 33883]) Input IDs shape: torch.Size([1, 33883]) Labels shape: torch.Size([1, 33883]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 27228 Attention mask shape: torch.Size([1, 1, 27228, 27228]) Position ids shape: torch.Size([1, 27228]) Input IDs shape: torch.Size([1, 27228]) Labels shape: torch.Size([1, 27228]) Final batch size: 1, sequence length: 30361 Attention mask shape: torch.Size([1, 1, 30361, 30361]) Position ids shape: torch.Size([1, 30361]) Input IDs shape: torch.Size([1, 30361]) Labels shape: torch.Size([1, 30361]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 13064 Attention mask shape: torch.Size([1, 1, 13064, 13064]) Position ids shape: torch.Size([1, 13064]) Input IDs shape: torch.Size([1, 13064]) Labels shape: torch.Size([1, 13064]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 29691 Attention mask shape: torch.Size([1, 1, 29691, 29691]) Position ids shape: torch.Size([1, 29691]) Input IDs shape: torch.Size([1, 29691]) Labels shape: torch.Size([1, 29691]) Final batch size: 1, sequence length: 15180 Attention mask shape: torch.Size([1, 1, 15180, 15180]) Position ids shape: torch.Size([1, 15180]) Input IDs shape: torch.Size([1, 15180]) Labels shape: torch.Size([1, 15180]) Final batch size: 1, sequence length: 33549 Attention mask shape: torch.Size([1, 1, 33549, 33549]) Position ids shape: torch.Size([1, 33549]) Input IDs shape: torch.Size([1, 33549]) Labels shape: torch.Size([1, 33549]) Final batch size: 1, sequence length: 36532 Attention mask shape: torch.Size([1, 1, 36532, 36532]) Position ids shape: torch.Size([1, 36532]) Input IDs shape: torch.Size([1, 36532]) Labels shape: torch.Size([1, 36532]) Final batch size: 1, sequence length: 30419 Attention mask shape: torch.Size([1, 1, 30419, 30419]) Position ids shape: torch.Size([1, 30419]) Input IDs shape: torch.Size([1, 30419]) Labels shape: torch.Size([1, 30419]) Final batch size: 1, sequence length: 32351 Attention mask shape: torch.Size([1, 1, 32351, 32351]) Position ids shape: torch.Size([1, 32351]) Input IDs shape: torch.Size([1, 32351]) Labels shape: torch.Size([1, 32351]) {'loss': 0.3802, 'grad_norm': 1.1972050833760928, 'learning_rate': 9.993147673772869e-06, 'num_tokens': -inf, 'epoch': 0.75} Final batch size: 1, sequence length: 8845 Attention mask shape: torch.Size([1, 1, 8845, 8845]) Position ids shape: torch.Size([1, 8845]) Input IDs shape: torch.Size([1, 8845]) Labels shape: torch.Size([1, 8845]) Final batch size: 1, sequence length: 7235 Attention mask shape: torch.Size([1, 1, 7235, 7235]) Position ids shape: torch.Size([1, 7235]) Input IDs shape: torch.Size([1, 7235]) Labels shape: torch.Size([1, 7235]) Final batch size: 1, sequence length: 12215 Attention mask shape: torch.Size([1, 1, 12215, 12215]) Position ids shape: torch.Size([1, 12215]) Input IDs shape: torch.Size([1, 12215]) Labels shape: torch.Size([1, 12215]) Final batch size: 1, sequence length: 12830 Attention mask shape: torch.Size([1, 1, 12830, 12830]) Position ids shape: torch.Size([1, 12830]) Input IDs shape: torch.Size([1, 12830]) Labels shape: torch.Size([1, 12830]) Final batch size: 1, sequence length: 13665 Attention mask shape: torch.Size([1, 1, 13665, 13665]) Position ids shape: torch.Size([1, 13665]) Input IDs shape: torch.Size([1, 13665]) Labels shape: torch.Size([1, 13665]) Final batch size: 1, sequence length: 13575 Attention mask shape: torch.Size([1, 1, 13575, 13575]) Position ids shape: torch.Size([1, 13575]) Input IDs shape: torch.Size([1, 13575]) Labels shape: torch.Size([1, 13575]) Final batch size: 1, sequence length: 14689 Attention mask shape: torch.Size([1, 1, 14689, 14689]) Position ids shape: torch.Size([1, 14689]) Input IDs shape: torch.Size([1, 14689]) Labels shape: torch.Size([1, 14689]) Final batch size: 1, sequence length: 12622 Attention mask shape: torch.Size([1, 1, 12622, 12622]) Position ids shape: torch.Size([1, 12622]) Input IDs shape: torch.Size([1, 12622]) Labels shape: torch.Size([1, 12622]) Final batch size: 1, sequence length: 14482 Attention mask shape: torch.Size([1, 1, 14482, 14482]) Position ids shape: torch.Size([1, 14482]) Input IDs shape: torch.Size([1, 14482]) Labels shape: torch.Size([1, 14482]) Final batch size: 1, sequence length: 17016 Attention mask shape: torch.Size([1, 1, 17016, 17016]) Position ids shape: torch.Size([1, 17016]) Input IDs shape: torch.Size([1, 17016]) Labels shape: torch.Size([1, 17016]) Final batch size: 1, sequence length: 12562 Attention mask shape: torch.Size([1, 1, 12562, 12562]) Position ids shape: torch.Size([1, 12562]) Input IDs shape: torch.Size([1, 12562]) Labels shape: torch.Size([1, 12562]) Final batch size: 1, sequence length: 14833 Attention mask shape: torch.Size([1, 1, 14833, 14833]) Position ids shape: torch.Size([1, 14833]) Input IDs shape: torch.Size([1, 14833]) Labels shape: torch.Size([1, 14833]) Final batch size: 1, sequence length: 12960 Attention mask shape: torch.Size([1, 1, 12960, 12960]) Position ids shape: torch.Size([1, 12960]) Input IDs shape: torch.Size([1, 12960]) Labels shape: torch.Size([1, 12960]) Final batch size: 1, sequence length: 15673 Attention mask shape: torch.Size([1, 1, 15673, 15673]) Position ids shape: torch.Size([1, 15673]) Input IDs shape: torch.Size([1, 15673]) Labels shape: torch.Size([1, 15673]) Final batch size: 1, sequence length: 15816 Attention mask shape: torch.Size([1, 1, 15816, 15816]) Position ids shape: torch.Size([1, 15816]) Input IDs shape: torch.Size([1, 15816]) Labels shape: torch.Size([1, 15816]) Final batch size: 1, sequence length: 16145 Attention mask shape: torch.Size([1, 1, 16145, 16145]) Position ids shape: torch.Size([1, 16145]) Input IDs shape: torch.Size([1, 16145]) Labels shape: torch.Size([1, 16145]) Final batch size: 1, sequence length: 17512 Attention mask shape: torch.Size([1, 1, 17512, 17512]) Position ids shape: torch.Size([1, 17512]) Input IDs shape: torch.Size([1, 17512]) Labels shape: torch.Size([1, 17512]) Final batch size: 1, sequence length: 18922 Attention mask shape: torch.Size([1, 1, 18922, 18922]) Position ids shape: torch.Size([1, 18922]) Input IDs shape: torch.Size([1, 18922]) Labels shape: torch.Size([1, 18922]) Final batch size: 1, sequence length: 19409 Attention mask shape: torch.Size([1, 1, 19409, 19409]) Position ids shape: torch.Size([1, 19409]) Input IDs shape: torch.Size([1, 19409]) Labels shape: torch.Size([1, 19409]) Final batch size: 1, sequence length: 17988 Attention mask shape: torch.Size([1, 1, 17988, 17988]) Position ids shape: torch.Size([1, 17988]) Input IDs shape: torch.Size([1, 17988]) Labels shape: torch.Size([1, 17988]) Final batch size: 1, sequence length: 20630 Attention mask shape: torch.Size([1, 1, 20630, 20630]) Position ids shape: torch.Size([1, 20630]) Input IDs shape: torch.Size([1, 20630]) Labels shape: torch.Size([1, 20630]) Final batch size: 1, sequence length: 17026 Attention mask shape: torch.Size([1, 1, 17026, 17026]) Position ids shape: torch.Size([1, 17026]) Input IDs shape: torch.Size([1, 17026]) Labels shape: torch.Size([1, 17026]) Final batch size: 1, sequence length: 19225 Attention mask shape: torch.Size([1, 1, 19225, 19225]) Position ids shape: torch.Size([1, 19225]) Input IDs shape: torch.Size([1, 19225]) Labels shape: torch.Size([1, 19225]) Final batch size: 1, sequence length: 10494 Attention mask shape: torch.Size([1, 1, 10494, 10494]) Position ids shape: torch.Size([1, 10494]) Input IDs shape: torch.Size([1, 10494]) Labels shape: torch.Size([1, 10494]) Final batch size: 1, sequence length: 20585 Attention mask shape: torch.Size([1, 1, 20585, 20585]) Position ids shape: torch.Size([1, 20585]) Input IDs shape: torch.Size([1, 20585]) Labels shape: torch.Size([1, 20585]) Final batch size: 1, sequence length: 18978 Attention mask shape: torch.Size([1, 1, 18978, 18978]) Position ids shape: torch.Size([1, 18978]) Input IDs shape: torch.Size([1, 18978]) Labels shape: torch.Size([1, 18978]) Final batch size: 1, sequence length: 20389 Attention mask shape: torch.Size([1, 1, 20389, 20389]) Position ids shape: torch.Size([1, 20389]) Input IDs shape: torch.Size([1, 20389]) Labels shape: torch.Size([1, 20389]) Final batch size: 1, sequence length: 12969 Attention mask shape: torch.Size([1, 1, 12969, 12969]) Position ids shape: torch.Size([1, 12969]) Input IDs shape: torch.Size([1, 12969]) Labels shape: torch.Size([1, 12969]) Final batch size: 1, sequence length: 21598 Attention mask shape: torch.Size([1, 1, 21598, 21598]) Position ids shape: torch.Size([1, 21598]) Input IDs shape: torch.Size([1, 21598]) Labels shape: torch.Size([1, 21598]) Final batch size: 1, sequence length: 22949 Attention mask shape: torch.Size([1, 1, 22949, 22949]) Position ids shape: torch.Size([1, 22949]) Input IDs shape: torch.Size([1, 22949]) Labels shape: torch.Size([1, 22949]) Final batch size: 1, sequence length: 19325 Attention mask shape: torch.Size([1, 1, 19325, 19325]) Position ids shape: torch.Size([1, 19325]) Input IDs shape: torch.Size([1, 19325]) Labels shape: torch.Size([1, 19325]) Final batch size: 1, sequence length: 21075 Attention mask shape: torch.Size([1, 1, 21075, 21075]) Position ids shape: torch.Size([1, 21075]) Input IDs shape: torch.Size([1, 21075]) Labels shape: torch.Size([1, 21075]) Final batch size: 1, sequence length: 20465 Attention mask shape: torch.Size([1, 1, 20465, 20465]) Position ids shape: torch.Size([1, 20465]) Input IDs shape: torch.Size([1, 20465]) Labels shape: torch.Size([1, 20465]) Final batch size: 1, sequence length: 15222 Attention mask shape: torch.Size([1, 1, 15222, 15222]) Position ids shape: torch.Size([1, 15222]) Input IDs shape: torch.Size([1, 15222]) Labels shape: torch.Size([1, 15222]) Final batch size: 1, sequence length: 20784 Attention mask shape: torch.Size([1, 1, 20784, 20784]) Position ids shape: torch.Size([1, 20784]) Input IDs shape: torch.Size([1, 20784]) Labels shape: torch.Size([1, 20784]) Final batch size: 1, sequence length: 20991 Attention mask shape: torch.Size([1, 1, 20991, 20991]) Position ids shape: torch.Size([1, 20991]) Input IDs shape: torch.Size([1, 20991]) Labels shape: torch.Size([1, 20991]) Final batch size: 1, sequence length: 14577 Attention mask shape: torch.Size([1, 1, 14577, 14577]) Position ids shape: torch.Size([1, 14577]) Input IDs shape: torch.Size([1, 14577]) Labels shape: torch.Size([1, 14577]) Final batch size: 1, sequence length: 23437 Attention mask shape: torch.Size([1, 1, 23437, 23437]) Position ids shape: torch.Size([1, 23437]) Input IDs shape: torch.Size([1, 23437]) Labels shape: torch.Size([1, 23437]) Final batch size: 1, sequence length: 6618 Attention mask shape: torch.Size([1, 1, 6618, 6618]) Position ids shape: torch.Size([1, 6618]) Input IDs shape: torch.Size([1, 6618]) Labels shape: torch.Size([1, 6618]) Final batch size: 1, sequence length: 13072 Attention mask shape: torch.Size([1, 1, 13072, 13072]) Position ids shape: torch.Size([1, 13072]) Input IDs shape: torch.Size([1, 13072]) Labels shape: torch.Size([1, 13072]) Final batch size: 1, sequence length: 23442 Attention mask shape: torch.Size([1, 1, 23442, 23442]) Position ids shape: torch.Size([1, 23442]) Input IDs shape: torch.Size([1, 23442]) Labels shape: torch.Size([1, 23442]) Final batch size: 1, sequence length: 23293 Attention mask shape: torch.Size([1, 1, 23293, 23293]) Position ids shape: torch.Size([1, 23293]) Input IDs shape: torch.Size([1, 23293]) Labels shape: torch.Size([1, 23293]) Final batch size: 1, sequence length: 3010 Attention mask shape: torch.Size([1, 1, 3010, 3010]) Position ids shape: torch.Size([1, 3010]) Input IDs shape: torch.Size([1, 3010]) Labels shape: torch.Size([1, 3010]) Final batch size: 1, sequence length: 21322 Attention mask shape: torch.Size([1, 1, 21322, 21322]) Position ids shape: torch.Size([1, 21322]) Input IDs shape: torch.Size([1, 21322]) Labels shape: torch.Size([1, 21322]) Final batch size: 1, sequence length: 22778 Attention mask shape: torch.Size([1, 1, 22778, 22778]) Position ids shape: torch.Size([1, 22778]) Input IDs shape: torch.Size([1, 22778]) Labels shape: torch.Size([1, 22778]) Final batch size: 1, sequence length: 23805 Attention mask shape: torch.Size([1, 1, 23805, 23805]) Position ids shape: torch.Size([1, 23805]) Input IDs shape: torch.Size([1, 23805]) Labels shape: torch.Size([1, 23805]) Final batch size: 1, sequence length: 19724 Attention mask shape: torch.Size([1, 1, 19724, 19724]) Position ids shape: torch.Size([1, 19724]) Input IDs shape: torch.Size([1, 19724]) Labels shape: torch.Size([1, 19724]) Final batch size: 1, sequence length: 21907 Attention mask shape: torch.Size([1, 1, 21907, 21907]) Position ids shape: torch.Size([1, 21907]) Input IDs shape: torch.Size([1, 21907]) Labels shape: torch.Size([1, 21907]) Final batch size: 1, sequence length: 27384 Attention mask shape: torch.Size([1, 1, 27384, 27384]) Position ids shape: torch.Size([1, 27384]) Input IDs shape: torch.Size([1, 27384]) Labels shape: torch.Size([1, 27384]) Final batch size: 1, sequence length: 23755 Attention mask shape: torch.Size([1, 1, 23755, 23755]) Position ids shape: torch.Size([1, 23755]) Input IDs shape: torch.Size([1, 23755]) Labels shape: torch.Size([1, 23755]) Final batch size: 1, sequence length: 28012 Attention mask shape: torch.Size([1, 1, 28012, 28012]) Position ids shape: torch.Size([1, 28012]) Input IDs shape: torch.Size([1, 28012]) Labels shape: torch.Size([1, 28012]) Final batch size: 1, sequence length: 14548 Attention mask shape: torch.Size([1, 1, 14548, 14548]) Position ids shape: torch.Size([1, 14548]) Input IDs shape: torch.Size([1, 14548]) Labels shape: torch.Size([1, 14548]) Final batch size: 1, sequence length: 24107 Attention mask shape: torch.Size([1, 1, 24107, 24107]) Position ids shape: torch.Size([1, 24107]) Input IDs shape: torch.Size([1, 24107]) Labels shape: torch.Size([1, 24107]) Final batch size: 1, sequence length: 21853 Attention mask shape: torch.Size([1, 1, 21853, 21853]) Position ids shape: torch.Size([1, 21853]) Input IDs shape: torch.Size([1, 21853]) Labels shape: torch.Size([1, 21853]) Final batch size: 1, sequence length: 27520 Attention mask shape: torch.Size([1, 1, 27520, 27520]) Position ids shape: torch.Size([1, 27520]) Input IDs shape: torch.Size([1, 27520]) Labels shape: torch.Size([1, 27520]) Final batch size: 1, sequence length: 26424 Attention mask shape: torch.Size([1, 1, 26424, 26424]) Position ids shape: torch.Size([1, 26424]) Input IDs shape: torch.Size([1, 26424]) Labels shape: torch.Size([1, 26424]) Final batch size: 1, sequence length: 25934 Attention mask shape: torch.Size([1, 1, 25934, 25934]) Position ids shape: torch.Size([1, 25934]) Input IDs shape: torch.Size([1, 25934]) Labels shape: torch.Size([1, 25934]) Final batch size: 1, sequence length: 14137 Attention mask shape: torch.Size([1, 1, 14137, 14137]) Position ids shape: torch.Size([1, 14137]) Input IDs shape: torch.Size([1, 14137]) Labels shape: torch.Size([1, 14137]) Final batch size: 1, sequence length: 28474 Attention mask shape: torch.Size([1, 1, 28474, 28474]) Position ids shape: torch.Size([1, 28474]) Input IDs shape: torch.Size([1, 28474]) Labels shape: torch.Size([1, 28474]) Final batch size: 1, sequence length: 24111 Attention mask shape: torch.Size([1, 1, 24111, 24111]) Position ids shape: torch.Size([1, 24111]) Input IDs shape: torch.Size([1, 24111]) Labels shape: torch.Size([1, 24111]) Final batch size: 1, sequence length: 26500 Attention mask shape: torch.Size([1, 1, 26500, 26500]) Position ids shape: torch.Size([1, 26500]) Input IDs shape: torch.Size([1, 26500]) Labels shape: torch.Size([1, 26500]) Final batch size: 1, sequence length: 26454 Attention mask shape: torch.Size([1, 1, 26454, 26454]) Position ids shape: torch.Size([1, 26454]) Input IDs shape: torch.Size([1, 26454]) Labels shape: torch.Size([1, 26454]) Final batch size: 1, sequence length: 22851 Attention mask shape: torch.Size([1, 1, 22851, 22851]) Position ids shape: torch.Size([1, 22851]) Input IDs shape: torch.Size([1, 22851]) Labels shape: torch.Size([1, 22851]) Final batch size: 1, sequence length: 30235 Attention mask shape: torch.Size([1, 1, 30235, 30235]) Position ids shape: torch.Size([1, 30235]) Input IDs shape: torch.Size([1, 30235]) Labels shape: torch.Size([1, 30235]) Final batch size: 1, sequence length: 19027 Attention mask shape: torch.Size([1, 1, 19027, 19027]) Position ids shape: torch.Size([1, 19027]) Input IDs shape: torch.Size([1, 19027]) Labels shape: torch.Size([1, 19027]) Final batch size: 1, sequence length: 20314 Attention mask shape: torch.Size([1, 1, 20314, 20314]) Position ids shape: torch.Size([1, 20314]) Input IDs shape: torch.Size([1, 20314]) Labels shape: torch.Size([1, 20314]) Final batch size: 1, sequence length: 24364 Attention mask shape: torch.Size([1, 1, 24364, 24364]) Position ids shape: torch.Size([1, 24364]) Input IDs shape: torch.Size([1, 24364]) Labels shape: torch.Size([1, 24364]) Final batch size: 1, sequence length: 12519 Attention mask shape: torch.Size([1, 1, 12519, 12519]) Position ids shape: torch.Size([1, 12519]) Input IDs shape: torch.Size([1, 12519]) Labels shape: torch.Size([1, 12519]) Final batch size: 1, sequence length: 20028 Attention mask shape: torch.Size([1, 1, 20028, 20028]) Position ids shape: torch.Size([1, 20028]) Input IDs shape: torch.Size([1, 20028]) Labels shape: torch.Size([1, 20028]) Final batch size: 1, sequence length: 31404 Attention mask shape: torch.Size([1, 1, 31404, 31404]) Position ids shape: torch.Size([1, 31404]) Input IDs shape: torch.Size([1, 31404]) Labels shape: torch.Size([1, 31404]) Final batch size: 1, sequence length: 11662 Attention mask shape: torch.Size([1, 1, 11662, 11662]) Position ids shape: torch.Size([1, 11662]) Input IDs shape: torch.Size([1, 11662]) Labels shape: torch.Size([1, 11662]) Final batch size: 1, sequence length: 22920 Attention mask shape: torch.Size([1, 1, 22920, 22920]) Position ids shape: torch.Size([1, 22920]) Input IDs shape: torch.Size([1, 22920]) Labels shape: torch.Size([1, 22920]) Final batch size: 1, sequence length: 12234 Attention mask shape: torch.Size([1, 1, 12234, 12234]) Position ids shape: torch.Size([1, 12234]) Input IDs shape: torch.Size([1, 12234]) Labels shape: torch.Size([1, 12234]) Final batch size: 1, sequence length: 24716 Attention mask shape: torch.Size([1, 1, 24716, 24716]) Position ids shape: torch.Size([1, 24716]) Input IDs shape: torch.Size([1, 24716]) Labels shape: torch.Size([1, 24716]) Final batch size: 1, sequence length: 23855 Attention mask shape: torch.Size([1, 1, 23855, 23855]) Position ids shape: torch.Size([1, 23855]) Input IDs shape: torch.Size([1, 23855]) Labels shape: torch.Size([1, 23855]) Final batch size: 1, sequence length: 23695 Attention mask shape: torch.Size([1, 1, 23695, 23695]) Position ids shape: torch.Size([1, 23695]) Input IDs shape: torch.Size([1, 23695]) Labels shape: torch.Size([1, 23695]) Final batch size: 1, sequence length: 29842 Attention mask shape: torch.Size([1, 1, 29842, 29842]) Position ids shape: torch.Size([1, 29842]) Input IDs shape: torch.Size([1, 29842]) Labels shape: torch.Size([1, 29842]) Final batch size: 1, sequence length: 29385 Attention mask shape: torch.Size([1, 1, 29385, 29385]) Position ids shape: torch.Size([1, 29385]) Input IDs shape: torch.Size([1, 29385]) Labels shape: torch.Size([1, 29385]) Final batch size: 1, sequence length: 34169 Attention mask shape: torch.Size([1, 1, 34169, 34169]) Position ids shape: torch.Size([1, 34169]) Input IDs shape: torch.Size([1, 34169]) Labels shape: torch.Size([1, 34169]) Final batch size: 1, sequence length: 21168 Attention mask shape: torch.Size([1, 1, 21168, 21168]) Position ids shape: torch.Size([1, 21168]) Input IDs shape: torch.Size([1, 21168]) Labels shape: torch.Size([1, 21168]) Final batch size: 1, sequence length: 33179 Attention mask shape: torch.Size([1, 1, 33179, 33179]) Position ids shape: torch.Size([1, 33179]) Input IDs shape: torch.Size([1, 33179]) Labels shape: torch.Size([1, 33179]) Final batch size: 1, sequence length: 27107 Attention mask shape: torch.Size([1, 1, 27107, 27107]) Position ids shape: torch.Size([1, 27107]) Input IDs shape: torch.Size([1, 27107]) Labels shape: torch.Size([1, 27107]) Final batch size: 1, sequence length: 30041 Attention mask shape: torch.Size([1, 1, 30041, 30041]) Position ids shape: torch.Size([1, 30041]) Input IDs shape: torch.Size([1, 30041]) Labels shape: torch.Size([1, 30041]) Final batch size: 1, sequence length: 33765 Attention mask shape: torch.Size([1, 1, 33765, 33765]) Position ids shape: torch.Size([1, 33765]) Input IDs shape: torch.Size([1, 33765]) Labels shape: torch.Size([1, 33765]) Final batch size: 1, sequence length: 24830 Attention mask shape: torch.Size([1, 1, 24830, 24830]) Position ids shape: torch.Size([1, 24830]) Input IDs shape: torch.Size([1, 24830]) Labels shape: torch.Size([1, 24830]) Final batch size: 1, sequence length: 24923 Attention mask shape: torch.Size([1, 1, 24923, 24923]) Position ids shape: torch.Size([1, 24923]) Input IDs shape: torch.Size([1, 24923]) Labels shape: torch.Size([1, 24923]) Final batch size: 1, sequence length: 17763 Attention mask shape: torch.Size([1, 1, 17763, 17763]) Position ids shape: torch.Size([1, 17763]) Input IDs shape: torch.Size([1, 17763]) Labels shape: torch.Size([1, 17763]) Final batch size: 1, sequence length: 29743 Attention mask shape: torch.Size([1, 1, 29743, 29743]) Position ids shape: torch.Size([1, 29743]) Input IDs shape: torch.Size([1, 29743]) Labels shape: torch.Size([1, 29743]) Final batch size: 1, sequence length: 33321 Attention mask shape: torch.Size([1, 1, 33321, 33321]) Position ids shape: torch.Size([1, 33321]) Input IDs shape: torch.Size([1, 33321]) Labels shape: torch.Size([1, 33321]) Final batch size: 1, sequence length: 10139 Attention mask shape: torch.Size([1, 1, 10139, 10139]) Position ids shape: torch.Size([1, 10139]) Input IDs shape: torch.Size([1, 10139]) Labels shape: torch.Size([1, 10139]) Final batch size: 1, sequence length: 24187 Attention mask shape: torch.Size([1, 1, 24187, 24187]) Position ids shape: torch.Size([1, 24187]) Input IDs shape: torch.Size([1, 24187]) Labels shape: torch.Size([1, 24187]) Final batch size: 1, sequence length: 19302 Attention mask shape: torch.Size([1, 1, 19302, 19302]) Position ids shape: torch.Size([1, 19302]) Input IDs shape: torch.Size([1, 19302]) Labels shape: torch.Size([1, 19302]) Final batch size: 1, sequence length: 37657 Final batch size: 1, sequence length: 38576 Attention mask shape: torch.Size([1, 1, 37657, 37657]) Position ids shape: torch.Size([1, 37657]) Input IDs shape: torch.Size([1, 37657]) Labels shape: torch.Size([1, 37657]) Attention mask shape: torch.Size([1, 1, 38576, 38576]) Position ids shape: torch.Size([1, 38576]) Input IDs shape: torch.Size([1, 38576]) Labels shape: torch.Size([1, 38576]) Final batch size: 1, sequence length: 25990 Attention mask shape: torch.Size([1, 1, 25990, 25990]) Position ids shape: torch.Size([1, 25990]) Input IDs shape: torch.Size([1, 25990]) Labels shape: torch.Size([1, 25990]) Final batch size: 1, sequence length: 26696 Attention mask shape: torch.Size([1, 1, 26696, 26696]) Position ids shape: torch.Size([1, 26696]) Input IDs shape: torch.Size([1, 26696]) Labels shape: torch.Size([1, 26696]) Final batch size: 1, sequence length: 40006 Attention mask shape: torch.Size([1, 1, 40006, 40006]) Position ids shape: torch.Size([1, 40006]) Input IDs shape: torch.Size([1, 40006]) Labels shape: torch.Size([1, 40006]) Final batch size: 1, sequence length: 35555 Attention mask shape: torch.Size([1, 1, 35555, 35555]) Position ids shape: torch.Size([1, 35555]) Input IDs shape: torch.Size([1, 35555]) Labels shape: torch.Size([1, 35555]) Final batch size: 1, sequence length: 27738 Attention mask shape: torch.Size([1, 1, 27738, 27738]) Position ids shape: torch.Size([1, 27738]) Input IDs shape: torch.Size([1, 27738]) Labels shape: torch.Size([1, 27738]) Final batch size: 1, sequence length: 29286 Attention mask shape: torch.Size([1, 1, 29286, 29286]) Position ids shape: torch.Size([1, 29286]) Input IDs shape: torch.Size([1, 29286]) Labels shape: torch.Size([1, 29286]) Final batch size: 1, sequence length: 32217 Attention mask shape: torch.Size([1, 1, 32217, 32217]) Position ids shape: torch.Size([1, 32217]) Input IDs shape: torch.Size([1, 32217]) Labels shape: torch.Size([1, 32217]) Final batch size: 1, sequence length: 29085 Attention mask shape: torch.Size([1, 1, 29085, 29085]) Position ids shape: torch.Size([1, 29085]) Input IDs shape: torch.Size([1, 29085]) Labels shape: torch.Size([1, 29085]) Final batch size: 1, sequence length: 26685 Attention mask shape: torch.Size([1, 1, 26685, 26685]) Position ids shape: torch.Size([1, 26685]) Input IDs shape: torch.Size([1, 26685]) Labels shape: torch.Size([1, 26685]) Final batch size: 1, sequence length: 18734 Attention mask shape: torch.Size([1, 1, 18734, 18734]) Position ids shape: torch.Size([1, 18734]) Input IDs shape: torch.Size([1, 18734]) Labels shape: torch.Size([1, 18734]) Final batch size: 1, sequence length: 35949 Attention mask shape: torch.Size([1, 1, 35949, 35949]) Position ids shape: torch.Size([1, 35949]) Input IDs shape: torch.Size([1, 35949]) Labels shape: torch.Size([1, 35949]) Final batch size: 1, sequence length: 23848 Attention mask shape: torch.Size([1, 1, 23848, 23848]) Position ids shape: torch.Size([1, 23848]) Input IDs shape: torch.Size([1, 23848]) Labels shape: torch.Size([1, 23848]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 31991 Attention mask shape: torch.Size([1, 1, 31991, 31991]) Position ids shape: torch.Size([1, 31991]) Input IDs shape: torch.Size([1, 31991]) Labels shape: torch.Size([1, 31991]) Final batch size: 1, sequence length: 31115 Attention mask shape: torch.Size([1, 1, 31115, 31115]) Position ids shape: torch.Size([1, 31115]) Input IDs shape: torch.Size([1, 31115]) Labels shape: torch.Size([1, 31115]) Final batch size: 1, sequence length: 39018 Attention mask shape: torch.Size([1, 1, 39018, 39018]) Position ids shape: torch.Size([1, 39018]) Input IDs shape: torch.Size([1, 39018]) Labels shape: torch.Size([1, 39018]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 22500 Attention mask shape: torch.Size([1, 1, 22500, 22500]) Position ids shape: torch.Size([1, 22500]) Input IDs shape: torch.Size([1, 22500]) Labels shape: torch.Size([1, 22500]) Final batch size: 1, sequence length: 19138 Attention mask shape: torch.Size([1, 1, 19138, 19138]) Position ids shape: torch.Size([1, 19138]) Input IDs shape: torch.Size([1, 19138]) Labels shape: torch.Size([1, 19138]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 34628 Attention mask shape: torch.Size([1, 1, 34628, 34628]) Position ids shape: torch.Size([1, 34628]) Input IDs shape: torch.Size([1, 34628]) Labels shape: torch.Size([1, 34628]) Final batch size: 1, sequence length: 32762 Attention mask shape: torch.Size([1, 1, 32762, 32762]) Position ids shape: torch.Size([1, 32762]) Input IDs shape: torch.Size([1, 32762]) Labels shape: torch.Size([1, 32762]) Final batch size: 1, sequence length: 40834 Attention mask shape: torch.Size([1, 1, 40834, 40834]) Position ids shape: torch.Size([1, 40834]) Input IDs shape: torch.Size([1, 40834]) Labels shape: torch.Size([1, 40834]) Final batch size: 1, sequence length: 37680 Attention mask shape: torch.Size([1, 1, 37680, 37680]) Position ids shape: torch.Size([1, 37680]) Input IDs shape: torch.Size([1, 37680]) Labels shape: torch.Size([1, 37680]) Final batch size: 1, sequence length: 22778 Attention mask shape: torch.Size([1, 1, 22778, 22778]) Position ids shape: torch.Size([1, 22778]) Input IDs shape: torch.Size([1, 22778]) Labels shape: torch.Size([1, 22778]) Final batch size: 1, sequence length: 32675 Attention mask shape: torch.Size([1, 1, 32675, 32675]) Position ids shape: torch.Size([1, 32675]) Input IDs shape: torch.Size([1, 32675]) Labels shape: torch.Size([1, 32675]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 26372 Attention mask shape: torch.Size([1, 1, 26372, 26372]) Position ids shape: torch.Size([1, 26372]) Input IDs shape: torch.Size([1, 26372]) Labels shape: torch.Size([1, 26372]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 34791 Attention mask shape: torch.Size([1, 1, 34791, 34791]) Position ids shape: torch.Size([1, 34791]) Input IDs shape: torch.Size([1, 34791]) Labels shape: torch.Size([1, 34791]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) {'loss': 0.3718, 'grad_norm': 1.4362038538670252, 'learning_rate': 9.972609476841368e-06, 'num_tokens': -inf, 'epoch': 0.88} Final batch size: 1, sequence length: 5686 Attention mask shape: torch.Size([1, 1, 5686, 5686]) Position ids shape: torch.Size([1, 5686]) Input IDs shape: torch.Size([1, 5686]) Labels shape: torch.Size([1, 5686]) Final batch size: 1, sequence length: 6986 Attention mask shape: torch.Size([1, 1, 6986, 6986]) Position ids shape: torch.Size([1, 6986]) Input IDs shape: torch.Size([1, 6986]) Labels shape: torch.Size([1, 6986]) Final batch size: 1, sequence length: 11304 Attention mask shape: torch.Size([1, 1, 11304, 11304]) Position ids shape: torch.Size([1, 11304]) Input IDs shape: torch.Size([1, 11304]) Labels shape: torch.Size([1, 11304]) Final batch size: 1, sequence length: 12847 Attention mask shape: torch.Size([1, 1, 12847, 12847]) Position ids shape: torch.Size([1, 12847]) Input IDs shape: torch.Size([1, 12847]) Labels shape: torch.Size([1, 12847]) Final batch size: 1, sequence length: 13224 Attention mask shape: torch.Size([1, 1, 13224, 13224]) Position ids shape: torch.Size([1, 13224]) Input IDs shape: torch.Size([1, 13224]) Labels shape: torch.Size([1, 13224]) Final batch size: 1, sequence length: 14168 Attention mask shape: torch.Size([1, 1, 14168, 14168]) Position ids shape: torch.Size([1, 14168]) Input IDs shape: torch.Size([1, 14168]) Labels shape: torch.Size([1, 14168]) Final batch size: 1, sequence length: 13872 Attention mask shape: torch.Size([1, 1, 13872, 13872]) Position ids shape: torch.Size([1, 13872]) Input IDs shape: torch.Size([1, 13872]) Labels shape: torch.Size([1, 13872]) Final batch size: 1, sequence length: 14304 Attention mask shape: torch.Size([1, 1, 14304, 14304]) Position ids shape: torch.Size([1, 14304]) Input IDs shape: torch.Size([1, 14304]) Labels shape: torch.Size([1, 14304]) Final batch size: 1, sequence length: 15621 Attention mask shape: torch.Size([1, 1, 15621, 15621]) Position ids shape: torch.Size([1, 15621]) Input IDs shape: torch.Size([1, 15621]) Labels shape: torch.Size([1, 15621]) Final batch size: 1, sequence length: 16440 Attention mask shape: torch.Size([1, 1, 16440, 16440]) Position ids shape: torch.Size([1, 16440]) Input IDs shape: torch.Size([1, 16440]) Labels shape: torch.Size([1, 16440]) Final batch size: 1, sequence length: 16501 Attention mask shape: torch.Size([1, 1, 16501, 16501]) Position ids shape: torch.Size([1, 16501]) Input IDs shape: torch.Size([1, 16501]) Labels shape: torch.Size([1, 16501]) Final batch size: 1, sequence length: 16514 Attention mask shape: torch.Size([1, 1, 16514, 16514]) Position ids shape: torch.Size([1, 16514]) Input IDs shape: torch.Size([1, 16514]) Labels shape: torch.Size([1, 16514]) Final batch size: 1, sequence length: 17773 Attention mask shape: torch.Size([1, 1, 17773, 17773]) Position ids shape: torch.Size([1, 17773]) Input IDs shape: torch.Size([1, 17773]) Labels shape: torch.Size([1, 17773]) Final batch size: 1, sequence length: 15652 Attention mask shape: torch.Size([1, 1, 15652, 15652]) Position ids shape: torch.Size([1, 15652]) Input IDs shape: torch.Size([1, 15652]) Labels shape: torch.Size([1, 15652]) Final batch size: 1, sequence length: 17213 Attention mask shape: torch.Size([1, 1, 17213, 17213]) Position ids shape: torch.Size([1, 17213]) Input IDs shape: torch.Size([1, 17213]) Labels shape: torch.Size([1, 17213]) Final batch size: 1, sequence length: 18044 Attention mask shape: torch.Size([1, 1, 18044, 18044]) Position ids shape: torch.Size([1, 18044]) Input IDs shape: torch.Size([1, 18044]) Labels shape: torch.Size([1, 18044]) Final batch size: 1, sequence length: 17852 Attention mask shape: torch.Size([1, 1, 17852, 17852]) Position ids shape: torch.Size([1, 17852]) Input IDs shape: torch.Size([1, 17852]) Labels shape: torch.Size([1, 17852]) Final batch size: 1, sequence length: 16378 Attention mask shape: torch.Size([1, 1, 16378, 16378]) Position ids shape: torch.Size([1, 16378]) Input IDs shape: torch.Size([1, 16378]) Labels shape: torch.Size([1, 16378]) Final batch size: 1, sequence length: 21163 Attention mask shape: torch.Size([1, 1, 21163, 21163]) Position ids shape: torch.Size([1, 21163]) Input IDs shape: torch.Size([1, 21163]) Labels shape: torch.Size([1, 21163]) Final batch size: 1, sequence length: 18246 Attention mask shape: torch.Size([1, 1, 18246, 18246]) Position ids shape: torch.Size([1, 18246]) Input IDs shape: torch.Size([1, 18246]) Labels shape: torch.Size([1, 18246]) Final batch size: 1, sequence length: 18155 Attention mask shape: torch.Size([1, 1, 18155, 18155]) Position ids shape: torch.Size([1, 18155]) Input IDs shape: torch.Size([1, 18155]) Labels shape: torch.Size([1, 18155]) Final batch size: 1, sequence length: 18289 Attention mask shape: torch.Size([1, 1, 18289, 18289]) Position ids shape: torch.Size([1, 18289]) Input IDs shape: torch.Size([1, 18289]) Labels shape: torch.Size([1, 18289]) Final batch size: 1, sequence length: 21562 Attention mask shape: torch.Size([1, 1, 21562, 21562]) Position ids shape: torch.Size([1, 21562]) Input IDs shape: torch.Size([1, 21562]) Labels shape: torch.Size([1, 21562]) Final batch size: 1, sequence length: 21207 Attention mask shape: torch.Size([1, 1, 21207, 21207]) Position ids shape: torch.Size([1, 21207]) Input IDs shape: torch.Size([1, 21207]) Labels shape: torch.Size([1, 21207]) Final batch size: 1, sequence length: 20176 Attention mask shape: torch.Size([1, 1, 20176, 20176]) Position ids shape: torch.Size([1, 20176]) Input IDs shape: torch.Size([1, 20176]) Labels shape: torch.Size([1, 20176]) Final batch size: 1, sequence length: 20766 Attention mask shape: torch.Size([1, 1, 20766, 20766]) Position ids shape: torch.Size([1, 20766]) Input IDs shape: torch.Size([1, 20766]) Labels shape: torch.Size([1, 20766]) Final batch size: 1, sequence length: 22784 Attention mask shape: torch.Size([1, 1, 22784, 22784]) Position ids shape: torch.Size([1, 22784]) Input IDs shape: torch.Size([1, 22784]) Labels shape: torch.Size([1, 22784]) Final batch size: 1, sequence length: 21316 Attention mask shape: torch.Size([1, 1, 21316, 21316]) Position ids shape: torch.Size([1, 21316]) Input IDs shape: torch.Size([1, 21316]) Labels shape: torch.Size([1, 21316]) Final batch size: 1, sequence length: 20813 Attention mask shape: torch.Size([1, 1, 20813, 20813]) Position ids shape: torch.Size([1, 20813]) Input IDs shape: torch.Size([1, 20813]) Labels shape: torch.Size([1, 20813]) Final batch size: 1, sequence length: 22152 Attention mask shape: torch.Size([1, 1, 22152, 22152]) Position ids shape: torch.Size([1, 22152]) Input IDs shape: torch.Size([1, 22152]) Labels shape: torch.Size([1, 22152]) Final batch size: 1, sequence length: 22117 Attention mask shape: torch.Size([1, 1, 22117, 22117]) Position ids shape: torch.Size([1, 22117]) Input IDs shape: torch.Size([1, 22117]) Labels shape: torch.Size([1, 22117]) Final batch size: 1, sequence length: 21607 Attention mask shape: torch.Size([1, 1, 21607, 21607]) Position ids shape: torch.Size([1, 21607]) Input IDs shape: torch.Size([1, 21607]) Labels shape: torch.Size([1, 21607]) Final batch size: 1, sequence length: 22197 Attention mask shape: torch.Size([1, 1, 22197, 22197]) Position ids shape: torch.Size([1, 22197]) Input IDs shape: torch.Size([1, 22197]) Labels shape: torch.Size([1, 22197]) Final batch size: 1, sequence length: 25880 Attention mask shape: torch.Size([1, 1, 25880, 25880]) Position ids shape: torch.Size([1, 25880]) Input IDs shape: torch.Size([1, 25880]) Labels shape: torch.Size([1, 25880]) Final batch size: 1, sequence length: 21705 Attention mask shape: torch.Size([1, 1, 21705, 21705]) Position ids shape: torch.Size([1, 21705]) Input IDs shape: torch.Size([1, 21705]) Labels shape: torch.Size([1, 21705]) Final batch size: 1, sequence length: 25053 Attention mask shape: torch.Size([1, 1, 25053, 25053]) Position ids shape: torch.Size([1, 25053]) Input IDs shape: torch.Size([1, 25053]) Labels shape: torch.Size([1, 25053]) Final batch size: 1, sequence length: 26494 Attention mask shape: torch.Size([1, 1, 26494, 26494]) Position ids shape: torch.Size([1, 26494]) Input IDs shape: torch.Size([1, 26494]) Labels shape: torch.Size([1, 26494]) Final batch size: 1, sequence length: 30151 Attention mask shape: torch.Size([1, 1, 30151, 30151]) Position ids shape: torch.Size([1, 30151]) Input IDs shape: torch.Size([1, 30151]) Labels shape: torch.Size([1, 30151]) Final batch size: 1, sequence length: 26921 Attention mask shape: torch.Size([1, 1, 26921, 26921]) Position ids shape: torch.Size([1, 26921]) Input IDs shape: torch.Size([1, 26921]) Labels shape: torch.Size([1, 26921]) Final batch size: 1, sequence length: 27005 Attention mask shape: torch.Size([1, 1, 27005, 27005]) Position ids shape: torch.Size([1, 27005]) Input IDs shape: torch.Size([1, 27005]) Labels shape: torch.Size([1, 27005]) Final batch size: 1, sequence length: 28523 Attention mask shape: torch.Size([1, 1, 28523, 28523]) Position ids shape: torch.Size([1, 28523]) Input IDs shape: torch.Size([1, 28523]) Labels shape: torch.Size([1, 28523]) Final batch size: 1, sequence length: 26303 Attention mask shape: torch.Size([1, 1, 26303, 26303]) Position ids shape: torch.Size([1, 26303]) Input IDs shape: torch.Size([1, 26303]) Labels shape: torch.Size([1, 26303]) Final batch size: 1, sequence length: 30592 Attention mask shape: torch.Size([1, 1, 30592, 30592]) Position ids shape: torch.Size([1, 30592]) Input IDs shape: torch.Size([1, 30592]) Labels shape: torch.Size([1, 30592]) Final batch size: 1, sequence length: 32596 Attention mask shape: torch.Size([1, 1, 32596, 32596]) Position ids shape: torch.Size([1, 32596]) Input IDs shape: torch.Size([1, 32596]) Labels shape: torch.Size([1, 32596]) Final batch size: 1, sequence length: 33482 Attention mask shape: torch.Size([1, 1, 33482, 33482]) Position ids shape: torch.Size([1, 33482]) Input IDs shape: torch.Size([1, 33482]) Labels shape: torch.Size([1, 33482]) Final batch size: 1, sequence length: 32473 Attention mask shape: torch.Size([1, 1, 32473, 32473]) Position ids shape: torch.Size([1, 32473]) Input IDs shape: torch.Size([1, 32473]) Labels shape: torch.Size([1, 32473]) Final batch size: 1, sequence length: 35100 Attention mask shape: torch.Size([1, 1, 35100, 35100]) Position ids shape: torch.Size([1, 35100]) Input IDs shape: torch.Size([1, 35100]) Labels shape: torch.Size([1, 35100]) Final batch size: 1, sequence length: 28897 Attention mask shape: torch.Size([1, 1, 28897, 28897]) Position ids shape: torch.Size([1, 28897]) Input IDs shape: torch.Size([1, 28897]) Labels shape: torch.Size([1, 28897]) Final batch size: 1, sequence length: 34469 Attention mask shape: torch.Size([1, 1, 34469, 34469]) Position ids shape: torch.Size([1, 34469]) Input IDs shape: torch.Size([1, 34469]) Labels shape: torch.Size([1, 34469]) Final batch size: 1, sequence length: 35485 Attention mask shape: torch.Size([1, 1, 35485, 35485]) Position ids shape: torch.Size([1, 35485]) Input IDs shape: torch.Size([1, 35485]) Labels shape: torch.Size([1, 35485]) Final batch size: 1, sequence length: 35866 Attention mask shape: torch.Size([1, 1, 35866, 35866]) Position ids shape: torch.Size([1, 35866]) Input IDs shape: torch.Size([1, 35866]) Labels shape: torch.Size([1, 35866]) Final batch size: 1, sequence length: 35523 Attention mask shape: torch.Size([1, 1, 35523, 35523]) Position ids shape: torch.Size([1, 35523]) Input IDs shape: torch.Size([1, 35523]) Labels shape: torch.Size([1, 35523]) Final batch size: 1, sequence length: 35030 Attention mask shape: torch.Size([1, 1, 35030, 35030]) Position ids shape: torch.Size([1, 35030]) Input IDs shape: torch.Size([1, 35030]) Labels shape: torch.Size([1, 35030]) Final batch size: 1, sequence length: 37980 Attention mask shape: torch.Size([1, 1, 37980, 37980]) Position ids shape: torch.Size([1, 37980]) Input IDs shape: torch.Size([1, 37980]) Labels shape: torch.Size([1, 37980]) Final batch size: 1, sequence length: 40526 Attention mask shape: torch.Size([1, 1, 40526, 40526]) Position ids shape: torch.Size([1, 40526]) Input IDs shape: torch.Size([1, 40526]) Labels shape: torch.Size([1, 40526]) Final batch size: 1, sequence length: 39589 Attention mask shape: torch.Size([1, 1, 39589, 39589]) Position ids shape: torch.Size([1, 39589]) Input IDs shape: torch.Size([1, 39589]) Labels shape: torch.Size([1, 39589]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 37040 Attention mask shape: torch.Size([1, 1, 37040, 37040]) Position ids shape: torch.Size([1, 37040]) Input IDs shape: torch.Size([1, 37040]) Labels shape: torch.Size([1, 37040]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 38927 Attention mask shape: torch.Size([1, 1, 38927, 38927]) Position ids shape: torch.Size([1, 38927]) Input IDs shape: torch.Size([1, 38927]) Labels shape: torch.Size([1, 38927]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) {'loss': 0.3695, 'grad_norm': 1.3665289826973017, 'learning_rate': 9.938441702975689e-06, 'num_tokens': -inf, 'epoch': 1.0} Final batch size: 1, sequence length: 5686 Attention mask shape: torch.Size([1, 1, 5686, 5686]) Position ids shape: torch.Size([1, 5686]) Input IDs shape: torch.Size([1, 5686]) Labels shape: torch.Size([1, 5686]) Final batch size: 1, sequence length: 6986 Attention mask shape: torch.Size([1, 1, 6986, 6986]) Position ids shape: torch.Size([1, 6986]) Input IDs shape: torch.Size([1, 6986]) Labels shape: torch.Size([1, 6986]) Final batch size: 1, sequence length: 6362 Attention mask shape: torch.Size([1, 1, 6362, 6362]) Position ids shape: torch.Size([1, 6362]) Input IDs shape: torch.Size([1, 6362]) Labels shape: torch.Size([1, 6362]) Final batch size: 1, sequence length: 13224 Attention mask shape: torch.Size([1, 1, 13224, 13224]) Position ids shape: torch.Size([1, 13224]) Input IDs shape: torch.Size([1, 13224]) Labels shape: torch.Size([1, 13224]) Final batch size: 1, sequence length: 10523 Attention mask shape: torch.Size([1, 1, 10523, 10523]) Position ids shape: torch.Size([1, 10523]) Input IDs shape: torch.Size([1, 10523]) Labels shape: torch.Size([1, 10523]) Final batch size: 1, sequence length: 14304 Attention mask shape: torch.Size([1, 1, 14304, 14304]) Position ids shape: torch.Size([1, 14304]) Input IDs shape: torch.Size([1, 14304]) Labels shape: torch.Size([1, 14304]) Final batch size: 1, sequence length: 12847 Attention mask shape: torch.Size([1, 1, 12847, 12847]) Position ids shape: torch.Size([1, 12847]) Input IDs shape: torch.Size([1, 12847]) Labels shape: torch.Size([1, 12847]) Final batch size: 1, sequence length: 13872 Attention mask shape: torch.Size([1, 1, 13872, 13872]) Position ids shape: torch.Size([1, 13872]) Input IDs shape: torch.Size([1, 13872]) Labels shape: torch.Size([1, 13872]) Final batch size: 1, sequence length: 11304 Attention mask shape: torch.Size([1, 1, 11304, 11304]) Position ids shape: torch.Size([1, 11304]) Input IDs shape: torch.Size([1, 11304]) Labels shape: torch.Size([1, 11304]) Final batch size: 1, sequence length: 15621 Attention mask shape: torch.Size([1, 1, 15621, 15621]) Position ids shape: torch.Size([1, 15621]) Input IDs shape: torch.Size([1, 15621]) Labels shape: torch.Size([1, 15621]) Final batch size: 1, sequence length: 15652 Attention mask shape: torch.Size([1, 1, 15652, 15652]) Position ids shape: torch.Size([1, 15652]) Input IDs shape: torch.Size([1, 15652]) Labels shape: torch.Size([1, 15652]) Final batch size: 1, sequence length: 14168 Attention mask shape: torch.Size([1, 1, 14168, 14168]) Position ids shape: torch.Size([1, 14168]) Input IDs shape: torch.Size([1, 14168]) Labels shape: torch.Size([1, 14168]) Final batch size: 1, sequence length: 16514 Attention mask shape: torch.Size([1, 1, 16514, 16514]) Position ids shape: torch.Size([1, 16514]) Input IDs shape: torch.Size([1, 16514]) Labels shape: torch.Size([1, 16514]) Final batch size: 1, sequence length: 16378 Attention mask shape: torch.Size([1, 1, 16378, 16378]) Position ids shape: torch.Size([1, 16378]) Input IDs shape: torch.Size([1, 16378]) Labels shape: torch.Size([1, 16378]) Final batch size: 1, sequence length: 16440 Attention mask shape: torch.Size([1, 1, 16440, 16440]) Position ids shape: torch.Size([1, 16440]) Input IDs shape: torch.Size([1, 16440]) Labels shape: torch.Size([1, 16440]) Final batch size: 1, sequence length: 18044 Attention mask shape: torch.Size([1, 1, 18044, 18044]) Position ids shape: torch.Size([1, 18044]) Input IDs shape: torch.Size([1, 18044]) Labels shape: torch.Size([1, 18044]) Final batch size: 1, sequence length: 16501 Attention mask shape: torch.Size([1, 1, 16501, 16501]) Position ids shape: torch.Size([1, 16501]) Input IDs shape: torch.Size([1, 16501]) Labels shape: torch.Size([1, 16501]) Final batch size: 1, sequence length: 17213 Attention mask shape: torch.Size([1, 1, 17213, 17213]) Position ids shape: torch.Size([1, 17213]) Input IDs shape: torch.Size([1, 17213]) Labels shape: torch.Size([1, 17213]) Final batch size: 1, sequence length: 17852 Attention mask shape: torch.Size([1, 1, 17852, 17852]) Position ids shape: torch.Size([1, 17852]) Input IDs shape: torch.Size([1, 17852]) Labels shape: torch.Size([1, 17852]) Final batch size: 1, sequence length: 18155 Attention mask shape: torch.Size([1, 1, 18155, 18155]) Position ids shape: torch.Size([1, 18155]) Input IDs shape: torch.Size([1, 18155]) Labels shape: torch.Size([1, 18155]) Final batch size: 1, sequence length: 18289 Attention mask shape: torch.Size([1, 1, 18289, 18289]) Position ids shape: torch.Size([1, 18289]) Input IDs shape: torch.Size([1, 18289]) Labels shape: torch.Size([1, 18289]) Final batch size: 1, sequence length: 17773 Attention mask shape: torch.Size([1, 1, 17773, 17773]) Position ids shape: torch.Size([1, 17773]) Input IDs shape: torch.Size([1, 17773]) Labels shape: torch.Size([1, 17773]) Final batch size: 1, sequence length: 20176 Attention mask shape: torch.Size([1, 1, 20176, 20176]) Position ids shape: torch.Size([1, 20176]) Input IDs shape: torch.Size([1, 20176]) Labels shape: torch.Size([1, 20176]) Final batch size: 1, sequence length: 20766 Attention mask shape: torch.Size([1, 1, 20766, 20766]) Position ids shape: torch.Size([1, 20766]) Input IDs shape: torch.Size([1, 20766]) Labels shape: torch.Size([1, 20766]) Final batch size: 1, sequence length: 18246 Attention mask shape: torch.Size([1, 1, 18246, 18246]) Position ids shape: torch.Size([1, 18246]) Input IDs shape: torch.Size([1, 18246]) Labels shape: torch.Size([1, 18246]) Final batch size: 1, sequence length: 21562 Attention mask shape: torch.Size([1, 1, 21562, 21562]) Final batch size: 1, sequence length: 18545 Position ids shape: torch.Size([1, 21562]) Input IDs shape: torch.Size([1, 21562]) Labels shape: torch.Size([1, 21562]) Attention mask shape: torch.Size([1, 1, 18545, 18545]) Position ids shape: torch.Size([1, 18545]) Input IDs shape: torch.Size([1, 18545]) Labels shape: torch.Size([1, 18545]) Final batch size: 1, sequence length: 21705 Attention mask shape: torch.Size([1, 1, 21705, 21705]) Position ids shape: torch.Size([1, 21705]) Input IDs shape: torch.Size([1, 21705]) Labels shape: torch.Size([1, 21705]) Final batch size: 1, sequence length: 21607 Attention mask shape: torch.Size([1, 1, 21607, 21607]) Position ids shape: torch.Size([1, 21607]) Input IDs shape: torch.Size([1, 21607]) Labels shape: torch.Size([1, 21607]) Final batch size: 1, sequence length: 22152 Attention mask shape: torch.Size([1, 1, 22152, 22152]) Position ids shape: torch.Size([1, 22152]) Input IDs shape: torch.Size([1, 22152]) Labels shape: torch.Size([1, 22152]) Final batch size: 1, sequence length: 21316 Attention mask shape: torch.Size([1, 1, 21316, 21316]) Position ids shape: torch.Size([1, 21316]) Input IDs shape: torch.Size([1, 21316]) Labels shape: torch.Size([1, 21316]) Final batch size: 1, sequence length: 5911 Attention mask shape: torch.Size([1, 1, 5911, 5911]) Position ids shape: torch.Size([1, 5911]) Input IDs shape: torch.Size([1, 5911]) Labels shape: torch.Size([1, 5911]) Final batch size: 1, sequence length: 22784 Attention mask shape: torch.Size([1, 1, 22784, 22784]) Position ids shape: torch.Size([1, 22784]) Input IDs shape: torch.Size([1, 22784]) Labels shape: torch.Size([1, 22784]) Final batch size: 1, sequence length: 21207 Attention mask shape: torch.Size([1, 1, 21207, 21207]) Position ids shape: torch.Size([1, 21207]) Input IDs shape: torch.Size([1, 21207]) Labels shape: torch.Size([1, 21207]) Final batch size: 1, sequence length: 21163 Attention mask shape: torch.Size([1, 1, 21163, 21163]) Position ids shape: torch.Size([1, 21163]) Input IDs shape: torch.Size([1, 21163]) Labels shape: torch.Size([1, 21163]) Final batch size: 1, sequence length: 20813 Attention mask shape: torch.Size([1, 1, 20813, 20813]) Position ids shape: torch.Size([1, 20813]) Input IDs shape: torch.Size([1, 20813]) Labels shape: torch.Size([1, 20813]) Final batch size: 1, sequence length: 22197 Attention mask shape: torch.Size([1, 1, 22197, 22197]) Position ids shape: torch.Size([1, 22197]) Input IDs shape: torch.Size([1, 22197]) Labels shape: torch.Size([1, 22197]) Final batch size: 1, sequence length: 19962 Attention mask shape: torch.Size([1, 1, 19962, 19962]) Position ids shape: torch.Size([1, 19962]) Input IDs shape: torch.Size([1, 19962]) Labels shape: torch.Size([1, 19962]) Final batch size: 1, sequence length: 22117 Attention mask shape: torch.Size([1, 1, 22117, 22117]) Position ids shape: torch.Size([1, 22117]) Input IDs shape: torch.Size([1, 22117]) Labels shape: torch.Size([1, 22117]) Final batch size: 1, sequence length: 17910 Attention mask shape: torch.Size([1, 1, 17910, 17910]) Position ids shape: torch.Size([1, 17910]) Input IDs shape: torch.Size([1, 17910]) Labels shape: torch.Size([1, 17910]) Final batch size: 1, sequence length: 26921 Attention mask shape: torch.Size([1, 1, 26921, 26921]) Position ids shape: torch.Size([1, 26921]) Input IDs shape: torch.Size([1, 26921]) Labels shape: torch.Size([1, 26921]) Final batch size: 1, sequence length: 14496 Attention mask shape: torch.Size([1, 1, 14496, 14496]) Position ids shape: torch.Size([1, 14496]) Input IDs shape: torch.Size([1, 14496]) Labels shape: torch.Size([1, 14496]) Final batch size: 1, sequence length: 13031 Attention mask shape: torch.Size([1, 1, 13031, 13031]) Position ids shape: torch.Size([1, 13031]) Input IDs shape: torch.Size([1, 13031]) Labels shape: torch.Size([1, 13031]) Final batch size: 1, sequence length: 25053 Attention mask shape: torch.Size([1, 1, 25053, 25053]) Position ids shape: torch.Size([1, 25053]) Input IDs shape: torch.Size([1, 25053]) Labels shape: torch.Size([1, 25053]) Final batch size: 1, sequence length: 23338 Attention mask shape: torch.Size([1, 1, 23338, 23338]) Position ids shape: torch.Size([1, 23338]) Input IDs shape: torch.Size([1, 23338]) Labels shape: torch.Size([1, 23338]) Final batch size: 1, sequence length: 26494 Attention mask shape: torch.Size([1, 1, 26494, 26494]) Position ids shape: torch.Size([1, 26494]) Input IDs shape: torch.Size([1, 26494]) Labels shape: torch.Size([1, 26494]) Final batch size: 1, sequence length: 21962 Attention mask shape: torch.Size([1, 1, 21962, 21962]) Position ids shape: torch.Size([1, 21962]) Input IDs shape: torch.Size([1, 21962]) Labels shape: torch.Size([1, 21962]) Final batch size: 1, sequence length: 21672 Attention mask shape: torch.Size([1, 1, 21672, 21672]) Position ids shape: torch.Size([1, 21672]) Input IDs shape: torch.Size([1, 21672]) Labels shape: torch.Size([1, 21672]) Final batch size: 1, sequence length: 20101 Attention mask shape: torch.Size([1, 1, 20101, 20101]) Position ids shape: torch.Size([1, 20101]) Input IDs shape: torch.Size([1, 20101]) Labels shape: torch.Size([1, 20101]) Final batch size: 1, sequence length: 12421 Attention mask shape: torch.Size([1, 1, 12421, 12421]) Position ids shape: torch.Size([1, 12421]) Input IDs shape: torch.Size([1, 12421]) Labels shape: torch.Size([1, 12421]) Final batch size: 1, sequence length: 24365 Attention mask shape: torch.Size([1, 1, 24365, 24365]) Position ids shape: torch.Size([1, 24365]) Input IDs shape: torch.Size([1, 24365]) Labels shape: torch.Size([1, 24365]) Final batch size: 1, sequence length: 27633 Attention mask shape: torch.Size([1, 1, 27633, 27633]) Position ids shape: torch.Size([1, 27633]) Input IDs shape: torch.Size([1, 27633]) Labels shape: torch.Size([1, 27633]) Final batch size: 1, sequence length: 12224 Attention mask shape: torch.Size([1, 1, 12224, 12224]) Position ids shape: torch.Size([1, 12224]) Input IDs shape: torch.Size([1, 12224]) Labels shape: torch.Size([1, 12224]) Final batch size: 1, sequence length: 16057 Attention mask shape: torch.Size([1, 1, 16057, 16057]) Position ids shape: torch.Size([1, 16057]) Input IDs shape: torch.Size([1, 16057]) Labels shape: torch.Size([1, 16057]) Final batch size: 1, sequence length: 26303 Attention mask shape: torch.Size([1, 1, 26303, 26303]) Position ids shape: torch.Size([1, 26303]) Input IDs shape: torch.Size([1, 26303]) Labels shape: torch.Size([1, 26303]) Final batch size: 1, sequence length: 18470 Attention mask shape: torch.Size([1, 1, 18470, 18470]) Position ids shape: torch.Size([1, 18470]) Input IDs shape: torch.Size([1, 18470]) Labels shape: torch.Size([1, 18470]) Final batch size: 1, sequence length: 11515 Attention mask shape: torch.Size([1, 1, 11515, 11515]) Position ids shape: torch.Size([1, 11515]) Input IDs shape: torch.Size([1, 11515]) Labels shape: torch.Size([1, 11515]) Final batch size: 1, sequence length: 9704 Attention mask shape: torch.Size([1, 1, 9704, 9704]) Position ids shape: torch.Size([1, 9704]) Input IDs shape: torch.Size([1, 9704]) Labels shape: torch.Size([1, 9704]) Final batch size: 1, sequence length: 21581 Attention mask shape: torch.Size([1, 1, 21581, 21581]) Position ids shape: torch.Size([1, 21581]) Input IDs shape: torch.Size([1, 21581]) Labels shape: torch.Size([1, 21581]) Final batch size: 1, sequence length: 26138 Attention mask shape: torch.Size([1, 1, 26138, 26138]) Position ids shape: torch.Size([1, 26138]) Input IDs shape: torch.Size([1, 26138]) Labels shape: torch.Size([1, 26138]) Final batch size: 1, sequence length: 28897 Attention mask shape: torch.Size([1, 1, 28897, 28897]) Position ids shape: torch.Size([1, 28897]) Input IDs shape: torch.Size([1, 28897]) Labels shape: torch.Size([1, 28897]) Final batch size: 1, sequence length: 23975 Attention mask shape: torch.Size([1, 1, 23975, 23975]) Position ids shape: torch.Size([1, 23975]) Input IDs shape: torch.Size([1, 23975]) Labels shape: torch.Size([1, 23975]) Final batch size: 1, sequence length: 20198 Attention mask shape: torch.Size([1, 1, 20198, 20198]) Position ids shape: torch.Size([1, 20198]) Input IDs shape: torch.Size([1, 20198]) Labels shape: torch.Size([1, 20198]) Final batch size: 1, sequence length: 28523 Attention mask shape: torch.Size([1, 1, 28523, 28523]) Position ids shape: torch.Size([1, 28523]) Input IDs shape: torch.Size([1, 28523]) Labels shape: torch.Size([1, 28523]) Final batch size: 1, sequence length: 20184 Attention mask shape: torch.Size([1, 1, 20184, 20184]) Position ids shape: torch.Size([1, 20184]) Input IDs shape: torch.Size([1, 20184]) Labels shape: torch.Size([1, 20184]) Final batch size: 1, sequence length: 29404 Attention mask shape: torch.Size([1, 1, 29404, 29404]) Position ids shape: torch.Size([1, 29404]) Input IDs shape: torch.Size([1, 29404]) Labels shape: torch.Size([1, 29404]) Final batch size: 1, sequence length: 21766 Attention mask shape: torch.Size([1, 1, 21766, 21766]) Position ids shape: torch.Size([1, 21766]) Input IDs shape: torch.Size([1, 21766]) Labels shape: torch.Size([1, 21766]) Final batch size: 1, sequence length: 24121 Attention mask shape: torch.Size([1, 1, 24121, 24121]) Position ids shape: torch.Size([1, 24121]) Input IDs shape: torch.Size([1, 24121]) Labels shape: torch.Size([1, 24121]) Final batch size: 1, sequence length: 30592 Attention mask shape: torch.Size([1, 1, 30592, 30592]) Position ids shape: torch.Size([1, 30592]) Input IDs shape: torch.Size([1, 30592]) Labels shape: torch.Size([1, 30592]) Final batch size: 1, sequence length: 17456 Attention mask shape: torch.Size([1, 1, 17456, 17456]) Position ids shape: torch.Size([1, 17456]) Input IDs shape: torch.Size([1, 17456]) Labels shape: torch.Size([1, 17456]) Final batch size: 1, sequence length: 32473 Attention mask shape: torch.Size([1, 1, 32473, 32473]) Position ids shape: torch.Size([1, 32473]) Input IDs shape: torch.Size([1, 32473]) Labels shape: torch.Size([1, 32473]) Final batch size: 1, sequence length: 23558 Attention mask shape: torch.Size([1, 1, 23558, 23558]) Position ids shape: torch.Size([1, 23558]) Input IDs shape: torch.Size([1, 23558]) Labels shape: torch.Size([1, 23558]) Final batch size: 1, sequence length: 33482 Attention mask shape: torch.Size([1, 1, 33482, 33482]) Position ids shape: torch.Size([1, 33482]) Input IDs shape: torch.Size([1, 33482]) Labels shape: torch.Size([1, 33482]) Final batch size: 1, sequence length: 35485 Attention mask shape: torch.Size([1, 1, 35485, 35485]) Position ids shape: torch.Size([1, 35485]) Input IDs shape: torch.Size([1, 35485]) Labels shape: torch.Size([1, 35485]) Final batch size: 1, sequence length: 30965 Attention mask shape: torch.Size([1, 1, 30965, 30965]) Position ids shape: torch.Size([1, 30965]) Input IDs shape: torch.Size([1, 30965]) Labels shape: torch.Size([1, 30965]) Final batch size: 1, sequence length: 34469 Attention mask shape: torch.Size([1, 1, 34469, 34469]) Position ids shape: torch.Size([1, 34469]) Input IDs shape: torch.Size([1, 34469]) Labels shape: torch.Size([1, 34469]) Final batch size: 1, sequence length: 29481 Attention mask shape: torch.Size([1, 1, 29481, 29481]) Position ids shape: torch.Size([1, 29481]) Input IDs shape: torch.Size([1, 29481]) Labels shape: torch.Size([1, 29481]) Final batch size: 1, sequence length: 25880 Attention mask shape: torch.Size([1, 1, 25880, 25880]) Position ids shape: torch.Size([1, 25880]) Input IDs shape: torch.Size([1, 25880]) Labels shape: torch.Size([1, 25880]) Final batch size: 1, sequence length: 19620 Attention mask shape: torch.Size([1, 1, 19620, 19620]) Position ids shape: torch.Size([1, 19620]) Input IDs shape: torch.Size([1, 19620]) Labels shape: torch.Size([1, 19620]) Final batch size: 1, sequence length: 35100 Attention mask shape: torch.Size([1, 1, 35100, 35100]) Position ids shape: torch.Size([1, 35100]) Input IDs shape: torch.Size([1, 35100]) Labels shape: torch.Size([1, 35100]) Final batch size: 1, sequence length: 18126 Attention mask shape: torch.Size([1, 1, 18126, 18126]) Position ids shape: torch.Size([1, 18126]) Input IDs shape: torch.Size([1, 18126]) Labels shape: torch.Size([1, 18126]) Final batch size: 1, sequence length: 32596 Attention mask shape: torch.Size([1, 1, 32596, 32596]) Position ids shape: torch.Size([1, 32596]) Input IDs shape: torch.Size([1, 32596]) Labels shape: torch.Size([1, 32596]) Final batch size: 1, sequence length: 28773 Attention mask shape: torch.Size([1, 1, 28773, 28773]) Position ids shape: torch.Size([1, 28773]) Input IDs shape: torch.Size([1, 28773]) Labels shape: torch.Size([1, 28773]) Final batch size: 1, sequence length: 33685 Attention mask shape: torch.Size([1, 1, 33685, 33685]) Position ids shape: torch.Size([1, 33685]) Input IDs shape: torch.Size([1, 33685]) Labels shape: torch.Size([1, 33685]) Final batch size: 1, sequence length: 21491 Attention mask shape: torch.Size([1, 1, 21491, 21491]) Position ids shape: torch.Size([1, 21491]) Input IDs shape: torch.Size([1, 21491]) Labels shape: torch.Size([1, 21491]) Final batch size: 1, sequence length: 16257 Attention mask shape: torch.Size([1, 1, 16257, 16257]) Position ids shape: torch.Size([1, 16257]) Input IDs shape: torch.Size([1, 16257]) Labels shape: torch.Size([1, 16257]) Final batch size: 1, sequence length: 37040 Attention mask shape: torch.Size([1, 1, 37040, 37040]) Position ids shape: torch.Size([1, 37040]) Input IDs shape: torch.Size([1, 37040]) Labels shape: torch.Size([1, 37040]) Final batch size: 1, sequence length: 30428 Attention mask shape: torch.Size([1, 1, 30428, 30428]) Position ids shape: torch.Size([1, 30428]) Input IDs shape: torch.Size([1, 30428]) Labels shape: torch.Size([1, 30428]) Final batch size: 1, sequence length: 30072 Attention mask shape: torch.Size([1, 1, 30072, 30072]) Position ids shape: torch.Size([1, 30072]) Input IDs shape: torch.Size([1, 30072]) Labels shape: torch.Size([1, 30072]) Final batch size: 1, sequence length: 40526 Attention mask shape: torch.Size([1, 1, 40526, 40526]) Position ids shape: torch.Size([1, 40526]) Input IDs shape: torch.Size([1, 40526]) Labels shape: torch.Size([1, 40526]) Final batch size: 1, sequence length: 26215 Attention mask shape: torch.Size([1, 1, 26215, 26215]) Position ids shape: torch.Size([1, 26215]) Input IDs shape: torch.Size([1, 26215]) Labels shape: torch.Size([1, 26215]) Final batch size: 1, sequence length: 29537 Attention mask shape: torch.Size([1, 1, 29537, 29537]) Position ids shape: torch.Size([1, 29537]) Input IDs shape: torch.Size([1, 29537]) Labels shape: torch.Size([1, 29537]) Final batch size: 1, sequence length: 37980 Attention mask shape: torch.Size([1, 1, 37980, 37980]) Position ids shape: torch.Size([1, 37980]) Input IDs shape: torch.Size([1, 37980]) Labels shape: torch.Size([1, 37980]) Final batch size: 1, sequence length: 38927 Attention mask shape: torch.Size([1, 1, 38927, 38927]) Position ids shape: torch.Size([1, 38927]) Input IDs shape: torch.Size([1, 38927]) Labels shape: torch.Size([1, 38927]) Final batch size: 1, sequence length: 37992 Attention mask shape: torch.Size([1, 1, 37992, 37992]) Position ids shape: torch.Size([1, 37992]) Input IDs shape: torch.Size([1, 37992]) Labels shape: torch.Size([1, 37992]) Final batch size: 1, sequence length: 13622 Attention mask shape: torch.Size([1, 1, 13622, 13622]) Position ids shape: torch.Size([1, 13622]) Input IDs shape: torch.Size([1, 13622]) Labels shape: torch.Size([1, 13622]) Final batch size: 1, sequence length: 11608 Attention mask shape: torch.Size([1, 1, 11608, 11608]) Position ids shape: torch.Size([1, 11608]) Input IDs shape: torch.Size([1, 11608]) Labels shape: torch.Size([1, 11608]) Final batch size: 1, sequence length: 17870 Attention mask shape: torch.Size([1, 1, 17870, 17870]) Position ids shape: torch.Size([1, 17870]) Input IDs shape: torch.Size([1, 17870]) Labels shape: torch.Size([1, 17870]) Final batch size: 1, sequence length: 18454 Attention mask shape: torch.Size([1, 1, 18454, 18454]) Position ids shape: torch.Size([1, 18454]) Input IDs shape: torch.Size([1, 18454]) Labels shape: torch.Size([1, 18454]) Final batch size: 1, sequence length: 18023 Attention mask shape: torch.Size([1, 1, 18023, 18023]) Position ids shape: torch.Size([1, 18023]) Input IDs shape: torch.Size([1, 18023]) Labels shape: torch.Size([1, 18023]) Final batch size: 1, sequence length: 26635 Attention mask shape: torch.Size([1, 1, 26635, 26635]) Position ids shape: torch.Size([1, 26635]) Input IDs shape: torch.Size([1, 26635]) Labels shape: torch.Size([1, 26635]) Final batch size: 1, sequence length: 27702 Attention mask shape: torch.Size([1, 1, 27702, 27702]) Position ids shape: torch.Size([1, 27702]) Input IDs shape: torch.Size([1, 27702]) Labels shape: torch.Size([1, 27702]) Final batch size: 1, sequence length: 21250 Attention mask shape: torch.Size([1, 1, 21250, 21250]) Position ids shape: torch.Size([1, 21250]) Input IDs shape: torch.Size([1, 21250]) Labels shape: torch.Size([1, 21250]) Final batch size: 1, sequence length: 30789 Attention mask shape: torch.Size([1, 1, 30789, 30789]) Position ids shape: torch.Size([1, 30789]) Input IDs shape: torch.Size([1, 30789]) Labels shape: torch.Size([1, 30789]) Final batch size: 1, sequence length: 39589 Attention mask shape: torch.Size([1, 1, 39589, 39589]) Position ids shape: torch.Size([1, 39589]) Input IDs shape: torch.Size([1, 39589]) Labels shape: torch.Size([1, 39589]) Final batch size: 1, sequence length: 26708 Attention mask shape: torch.Size([1, 1, 26708, 26708]) Position ids shape: torch.Size([1, 26708]) Input IDs shape: torch.Size([1, 26708]) Labels shape: torch.Size([1, 26708]) Final batch size: 1, sequence length: 35030 Attention mask shape: torch.Size([1, 1, 35030, 35030]) Position ids shape: torch.Size([1, 35030]) Input IDs shape: torch.Size([1, 35030]) Labels shape: torch.Size([1, 35030]) Final batch size: 1, sequence length: 17623 Attention mask shape: torch.Size([1, 1, 17623, 17623]) Position ids shape: torch.Size([1, 17623]) Input IDs shape: torch.Size([1, 17623]) Labels shape: torch.Size([1, 17623]) Final batch size: 1, sequence length: 19702 Attention mask shape: torch.Size([1, 1, 19702, 19702]) Position ids shape: torch.Size([1, 19702]) Input IDs shape: torch.Size([1, 19702]) Labels shape: torch.Size([1, 19702]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 30772 Attention mask shape: torch.Size([1, 1, 30772, 30772]) Position ids shape: torch.Size([1, 30772]) Input IDs shape: torch.Size([1, 30772]) Labels shape: torch.Size([1, 30772]) Final batch size: 1, sequence length: 29875 Attention mask shape: torch.Size([1, 1, 29875, 29875]) Position ids shape: torch.Size([1, 29875]) Input IDs shape: torch.Size([1, 29875]) Labels shape: torch.Size([1, 29875]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 22309 Attention mask shape: torch.Size([1, 1, 22309, 22309]) Position ids shape: torch.Size([1, 22309]) Input IDs shape: torch.Size([1, 22309]) Labels shape: torch.Size([1, 22309]) Final batch size: 1, sequence length: 19538 Attention mask shape: torch.Size([1, 1, 19538, 19538]) Position ids shape: torch.Size([1, 19538]) Input IDs shape: torch.Size([1, 19538]) Labels shape: torch.Size([1, 19538]) Final batch size: 1, sequence length: 11403 Attention mask shape: torch.Size([1, 1, 11403, 11403]) Position ids shape: torch.Size([1, 11403]) Input IDs shape: torch.Size([1, 11403]) Labels shape: torch.Size([1, 11403]) Final batch size: 1, sequence length: 36580 Attention mask shape: torch.Size([1, 1, 36580, 36580]) Position ids shape: torch.Size([1, 36580]) Input IDs shape: torch.Size([1, 36580]) Labels shape: torch.Size([1, 36580]) Final batch size: 1, sequence length: 30859 Attention mask shape: torch.Size([1, 1, 30859, 30859]) Position ids shape: torch.Size([1, 30859]) Input IDs shape: torch.Size([1, 30859]) Labels shape: torch.Size([1, 30859]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 18606 Attention mask shape: torch.Size([1, 1, 18606, 18606]) Position ids shape: torch.Size([1, 18606]) Input IDs shape: torch.Size([1, 18606]) Labels shape: torch.Size([1, 18606]) Final batch size: 1, sequence length: 21936 Attention mask shape: torch.Size([1, 1, 21936, 21936]) Position ids shape: torch.Size([1, 21936]) Input IDs shape: torch.Size([1, 21936]) Labels shape: torch.Size([1, 21936]) Final batch size: 1, sequence length: 15830 Attention mask shape: torch.Size([1, 1, 15830, 15830]) Position ids shape: torch.Size([1, 15830]) Input IDs shape: torch.Size([1, 15830]) Labels shape: torch.Size([1, 15830]) Final batch size: 1, sequence length: 34684 Attention mask shape: torch.Size([1, 1, 34684, 34684]) Position ids shape: torch.Size([1, 34684]) Input IDs shape: torch.Size([1, 34684]) Labels shape: torch.Size([1, 34684]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 31745 Attention mask shape: torch.Size([1, 1, 31745, 31745]) Position ids shape: torch.Size([1, 31745]) Input IDs shape: torch.Size([1, 31745]) Labels shape: torch.Size([1, 31745]) Final batch size: 1, sequence length: 35077 Attention mask shape: torch.Size([1, 1, 35077, 35077]) Position ids shape: torch.Size([1, 35077]) Input IDs shape: torch.Size([1, 35077]) Labels shape: torch.Size([1, 35077]) Final batch size: 1, sequence length: 15993 Attention mask shape: torch.Size([1, 1, 15993, 15993]) Position ids shape: torch.Size([1, 15993]) Input IDs shape: torch.Size([1, 15993]) Labels shape: torch.Size([1, 15993]) Final batch size: 1, sequence length: 14730 Attention mask shape: torch.Size([1, 1, 14730, 14730]) Position ids shape: torch.Size([1, 14730]) Input IDs shape: torch.Size([1, 14730]) Labels shape: torch.Size([1, 14730]) Final batch size: 1, sequence length: 17811 Attention mask shape: torch.Size([1, 1, 17811, 17811]) Position ids shape: torch.Size([1, 17811]) Input IDs shape: torch.Size([1, 17811]) Labels shape: torch.Size([1, 17811]) Final batch size: 1, sequence length: 17971 Attention mask shape: torch.Size([1, 1, 17971, 17971]) Position ids shape: torch.Size([1, 17971]) Input IDs shape: torch.Size([1, 17971]) Labels shape: torch.Size([1, 17971]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 19028 Attention mask shape: torch.Size([1, 1, 19028, 19028]) Position ids shape: torch.Size([1, 19028]) Input IDs shape: torch.Size([1, 19028]) Labels shape: torch.Size([1, 19028]) Final batch size: 1, sequence length: 39661 Attention mask shape: torch.Size([1, 1, 39661, 39661]) Position ids shape: torch.Size([1, 39661]) Input IDs shape: torch.Size([1, 39661]) Labels shape: torch.Size([1, 39661]) Final batch size: 1, sequence length: 29464 Attention mask shape: torch.Size([1, 1, 29464, 29464]) Position ids shape: torch.Size([1, 29464]) Input IDs shape: torch.Size([1, 29464]) Labels shape: torch.Size([1, 29464]) Final batch size: 1, sequence length: 26665 Attention mask shape: torch.Size([1, 1, 26665, 26665]) Position ids shape: torch.Size([1, 26665]) Input IDs shape: torch.Size([1, 26665]) Labels shape: torch.Size([1, 26665]) Final batch size: 1, sequence length: 29639 Attention mask shape: torch.Size([1, 1, 29639, 29639]) Position ids shape: torch.Size([1, 29639]) Input IDs shape: torch.Size([1, 29639]) Labels shape: torch.Size([1, 29639]) Final batch size: 1, sequence length: 28814 Attention mask shape: torch.Size([1, 1, 28814, 28814]) Position ids shape: torch.Size([1, 28814]) Input IDs shape: torch.Size([1, 28814]) Labels shape: torch.Size([1, 28814]) Final batch size: 1, sequence length: 28810 Attention mask shape: torch.Size([1, 1, 28810, 28810]) Position ids shape: torch.Size([1, 28810]) Input IDs shape: torch.Size([1, 28810]) Labels shape: torch.Size([1, 28810]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 13903 Attention mask shape: torch.Size([1, 1, 13903, 13903]) Position ids shape: torch.Size([1, 13903]) Input IDs shape: torch.Size([1, 13903]) Labels shape: torch.Size([1, 13903]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 16270 Attention mask shape: torch.Size([1, 1, 16270, 16270]) Position ids shape: torch.Size([1, 16270]) Input IDs shape: torch.Size([1, 16270]) Labels shape: torch.Size([1, 16270]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 36716 Attention mask shape: torch.Size([1, 1, 36716, 36716]) Position ids shape: torch.Size([1, 36716]) Input IDs shape: torch.Size([1, 36716]) Labels shape: torch.Size([1, 36716]) Final batch size: 1, sequence length: 22945 Attention mask shape: torch.Size([1, 1, 22945, 22945]) Position ids shape: torch.Size([1, 22945]) Input IDs shape: torch.Size([1, 22945]) Labels shape: torch.Size([1, 22945]) Final batch size: 1, sequence length: 17778 Attention mask shape: torch.Size([1, 1, 17778, 17778]) Position ids shape: torch.Size([1, 17778]) Input IDs shape: torch.Size([1, 17778]) Labels shape: torch.Size([1, 17778]) Final batch size: 1, sequence length: 26930 Attention mask shape: torch.Size([1, 1, 26930, 26930]) Position ids shape: torch.Size([1, 26930]) Input IDs shape: torch.Size([1, 26930]) Labels shape: torch.Size([1, 26930]) Final batch size: 1, sequence length: 20765 Attention mask shape: torch.Size([1, 1, 20765, 20765]) Position ids shape: torch.Size([1, 20765]) Input IDs shape: torch.Size([1, 20765]) Labels shape: torch.Size([1, 20765]) Final batch size: 1, sequence length: 37407 Attention mask shape: torch.Size([1, 1, 37407, 37407]) Position ids shape: torch.Size([1, 37407]) Input IDs shape: torch.Size([1, 37407]) Labels shape: torch.Size([1, 37407]) Final batch size: 1, sequence length: 30135 Attention mask shape: torch.Size([1, 1, 30135, 30135]) Position ids shape: torch.Size([1, 30135]) Input IDs shape: torch.Size([1, 30135]) Labels shape: torch.Size([1, 30135]) Final batch size: 1, sequence length: 24232 Attention mask shape: torch.Size([1, 1, 24232, 24232]) Position ids shape: torch.Size([1, 24232]) Input IDs shape: torch.Size([1, 24232]) Labels shape: torch.Size([1, 24232]) Final batch size: 1, sequence length: 31381 Attention mask shape: torch.Size([1, 1, 31381, 31381]) Position ids shape: torch.Size([1, 31381]) Input IDs shape: torch.Size([1, 31381]) Labels shape: torch.Size([1, 31381]) Final batch size: 1, sequence length: 35952 Attention mask shape: torch.Size([1, 1, 35952, 35952]) Position ids shape: torch.Size([1, 35952]) Input IDs shape: torch.Size([1, 35952]) Labels shape: torch.Size([1, 35952]) Final batch size: 1, sequence length: 37866 Attention mask shape: torch.Size([1, 1, 37866, 37866]) Position ids shape: torch.Size([1, 37866]) Input IDs shape: torch.Size([1, 37866]) Labels shape: torch.Size([1, 37866]) Final batch size: 1, sequence length: 27281 Attention mask shape: torch.Size([1, 1, 27281, 27281]) Position ids shape: torch.Size([1, 27281]) Input IDs shape: torch.Size([1, 27281]) Labels shape: torch.Size([1, 27281]) Final batch size: 1, sequence length: 20057 Attention mask shape: torch.Size([1, 1, 20057, 20057]) Position ids shape: torch.Size([1, 20057]) Input IDs shape: torch.Size([1, 20057]) Labels shape: torch.Size([1, 20057]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 21805 Attention mask shape: torch.Size([1, 1, 21805, 21805]) Position ids shape: torch.Size([1, 21805]) Input IDs shape: torch.Size([1, 21805]) Labels shape: torch.Size([1, 21805]) Final batch size: 1, sequence length: 31420 Attention mask shape: torch.Size([1, 1, 31420, 31420]) Position ids shape: torch.Size([1, 31420]) Input IDs shape: torch.Size([1, 31420]) Labels shape: torch.Size([1, 31420]) Final batch size: 1, sequence length: 32024 Attention mask shape: torch.Size([1, 1, 32024, 32024]) Position ids shape: torch.Size([1, 32024]) Input IDs shape: torch.Size([1, 32024]) Labels shape: torch.Size([1, 32024]) Final batch size: 1, sequence length: 37397 Attention mask shape: torch.Size([1, 1, 37397, 37397]) Position ids shape: torch.Size([1, 37397]) Input IDs shape: torch.Size([1, 37397]) Labels shape: torch.Size([1, 37397]) Final batch size: 1, sequence length: 21071 Attention mask shape: torch.Size([1, 1, 21071, 21071]) Position ids shape: torch.Size([1, 21071]) Input IDs shape: torch.Size([1, 21071]) Labels shape: torch.Size([1, 21071]) Final batch size: 1, sequence length: 14104 Attention mask shape: torch.Size([1, 1, 14104, 14104]) Position ids shape: torch.Size([1, 14104]) Input IDs shape: torch.Size([1, 14104]) Labels shape: torch.Size([1, 14104]) Final batch size: 1, sequence length: 25032 Attention mask shape: torch.Size([1, 1, 25032, 25032]) Position ids shape: torch.Size([1, 25032]) Input IDs shape: torch.Size([1, 25032]) Labels shape: torch.Size([1, 25032]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 21006 Attention mask shape: torch.Size([1, 1, 21006, 21006]) Position ids shape: torch.Size([1, 21006]) Input IDs shape: torch.Size([1, 21006]) Labels shape: torch.Size([1, 21006]) Final batch size: 1, sequence length: 25388 Attention mask shape: torch.Size([1, 1, 25388, 25388]) Position ids shape: torch.Size([1, 25388]) Input IDs shape: torch.Size([1, 25388]) Labels shape: torch.Size([1, 25388]) Final batch size: 1, sequence length: 35879 Attention mask shape: torch.Size([1, 1, 35879, 35879]) Position ids shape: torch.Size([1, 35879]) Input IDs shape: torch.Size([1, 35879]) Labels shape: torch.Size([1, 35879]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 18566 Attention mask shape: torch.Size([1, 1, 18566, 18566]) Position ids shape: torch.Size([1, 18566]) Input IDs shape: torch.Size([1, 18566]) Labels shape: torch.Size([1, 18566]) Final batch size: 1, sequence length: 36415 Attention mask shape: torch.Size([1, 1, 36415, 36415]) Position ids shape: torch.Size([1, 36415]) Input IDs shape: torch.Size([1, 36415]) Labels shape: torch.Size([1, 36415]) Final batch size: 1, sequence length: 40579 Attention mask shape: torch.Size([1, 1, 40579, 40579]) Position ids shape: torch.Size([1, 40579]) Input IDs shape: torch.Size([1, 40579]) Labels shape: torch.Size([1, 40579]) Final batch size: 1, sequence length: 23187 Attention mask shape: torch.Size([1, 1, 23187, 23187]) Position ids shape: torch.Size([1, 23187]) Input IDs shape: torch.Size([1, 23187]) Labels shape: torch.Size([1, 23187]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 25383 Attention mask shape: torch.Size([1, 1, 25383, 25383]) Position ids shape: torch.Size([1, 25383]) Input IDs shape: torch.Size([1, 25383]) Labels shape: torch.Size([1, 25383]) Final batch size: 1, sequence length: 39778 Attention mask shape: torch.Size([1, 1, 39778, 39778]) Position ids shape: torch.Size([1, 39778]) Input IDs shape: torch.Size([1, 39778]) Labels shape: torch.Size([1, 39778]) Final batch size: 1, sequence length: 37530 Attention mask shape: torch.Size([1, 1, 37530, 37530]) Position ids shape: torch.Size([1, 37530]) Input IDs shape: torch.Size([1, 37530]) Labels shape: torch.Size([1, 37530]) Final batch size: 1, sequence length: 32541 Attention mask shape: torch.Size([1, 1, 32541, 32541]) Position ids shape: torch.Size([1, 32541]) Input IDs shape: torch.Size([1, 32541]) Labels shape: torch.Size([1, 32541]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 12200 Attention mask shape: torch.Size([1, 1, 12200, 12200]) Position ids shape: torch.Size([1, 12200]) Input IDs shape: torch.Size([1, 12200]) Labels shape: torch.Size([1, 12200]) Final batch size: 1, sequence length: 39587 Attention mask shape: torch.Size([1, 1, 39587, 39587]) Position ids shape: torch.Size([1, 39587]) Input IDs shape: torch.Size([1, 39587]) Labels shape: torch.Size([1, 39587]) Final batch size: 1, sequence length: 39579 Attention mask shape: torch.Size([1, 1, 39579, 39579]) Position ids shape: torch.Size([1, 39579]) Input IDs shape: torch.Size([1, 39579]) Labels shape: torch.Size([1, 39579]) Final batch size: 1, sequence length: 19668 Attention mask shape: torch.Size([1, 1, 19668, 19668]) Position ids shape: torch.Size([1, 19668]) Input IDs shape: torch.Size([1, 19668]) Labels shape: torch.Size([1, 19668]) Final batch size: 1, sequence length: 14308 Attention mask shape: torch.Size([1, 1, 14308, 14308]) Position ids shape: torch.Size([1, 14308]) Input IDs shape: torch.Size([1, 14308]) Labels shape: torch.Size([1, 14308]) Final batch size: 1, sequence length: 9251 Attention mask shape: torch.Size([1, 1, 9251, 9251]) Position ids shape: torch.Size([1, 9251]) Input IDs shape: torch.Size([1, 9251]) Labels shape: torch.Size([1, 9251]) Final batch size: 1, sequence length: 37890 Attention mask shape: torch.Size([1, 1, 37890, 37890]) Position ids shape: torch.Size([1, 37890]) Input IDs shape: torch.Size([1, 37890]) Labels shape: torch.Size([1, 37890]) Final batch size: 1, sequence length: 36647 Attention mask shape: torch.Size([1, 1, 36647, 36647]) Position ids shape: torch.Size([1, 36647]) Input IDs shape: torch.Size([1, 36647]) Labels shape: torch.Size([1, 36647]) {'loss': 0.3458, 'grad_norm': 0.9969348971306272, 'learning_rate': 9.890738003669029e-06, 'num_tokens': -inf, 'epoch': 1.12} Final batch size: 1, sequence length: 4858 Attention mask shape: torch.Size([1, 1, 4858, 4858]) Position ids shape: torch.Size([1, 4858]) Input IDs shape: torch.Size([1, 4858]) Labels shape: torch.Size([1, 4858]) Final batch size: 1, sequence length: 6316 Attention mask shape: torch.Size([1, 1, 6316, 6316]) Position ids shape: torch.Size([1, 6316]) Input IDs shape: torch.Size([1, 6316]) Labels shape: torch.Size([1, 6316]) Final batch size: 1, sequence length: 11448 Attention mask shape: torch.Size([1, 1, 11448, 11448]) Position ids shape: torch.Size([1, 11448]) Input IDs shape: torch.Size([1, 11448]) Labels shape: torch.Size([1, 11448]) Final batch size: 1, sequence length: 12075 Attention mask shape: torch.Size([1, 1, 12075, 12075]) Position ids shape: torch.Size([1, 12075]) Input IDs shape: torch.Size([1, 12075]) Labels shape: torch.Size([1, 12075]) Final batch size: 1, sequence length: 12846 Attention mask shape: torch.Size([1, 1, 12846, 12846]) Position ids shape: torch.Size([1, 12846]) Input IDs shape: torch.Size([1, 12846]) Labels shape: torch.Size([1, 12846]) Final batch size: 1, sequence length: 12945 Attention mask shape: torch.Size([1, 1, 12945, 12945]) Position ids shape: torch.Size([1, 12945]) Input IDs shape: torch.Size([1, 12945]) Labels shape: torch.Size([1, 12945]) Final batch size: 1, sequence length: 7360 Attention mask shape: torch.Size([1, 1, 7360, 7360]) Position ids shape: torch.Size([1, 7360]) Input IDs shape: torch.Size([1, 7360]) Labels shape: torch.Size([1, 7360]) Final batch size: 1, sequence length: 15189 Attention mask shape: torch.Size([1, 1, 15189, 15189]) Position ids shape: torch.Size([1, 15189]) Input IDs shape: torch.Size([1, 15189]) Labels shape: torch.Size([1, 15189]) Final batch size: 1, sequence length: 14330 Attention mask shape: torch.Size([1, 1, 14330, 14330]) Position ids shape: torch.Size([1, 14330]) Input IDs shape: torch.Size([1, 14330]) Labels shape: torch.Size([1, 14330]) Final batch size: 1, sequence length: 14891 Attention mask shape: torch.Size([1, 1, 14891, 14891]) Position ids shape: torch.Size([1, 14891]) Input IDs shape: torch.Size([1, 14891]) Labels shape: torch.Size([1, 14891]) Final batch size: 1, sequence length: 17246 Attention mask shape: torch.Size([1, 1, 17246, 17246]) Position ids shape: torch.Size([1, 17246]) Input IDs shape: torch.Size([1, 17246]) Labels shape: torch.Size([1, 17246]) Final batch size: 1, sequence length: 16658 Attention mask shape: torch.Size([1, 1, 16658, 16658]) Position ids shape: torch.Size([1, 16658]) Input IDs shape: torch.Size([1, 16658]) Labels shape: torch.Size([1, 16658]) Final batch size: 1, sequence length: 16326 Attention mask shape: torch.Size([1, 1, 16326, 16326]) Position ids shape: torch.Size([1, 16326]) Input IDs shape: torch.Size([1, 16326]) Labels shape: torch.Size([1, 16326]) Final batch size: 1, sequence length: 16961 Attention mask shape: torch.Size([1, 1, 16961, 16961]) Position ids shape: torch.Size([1, 16961]) Input IDs shape: torch.Size([1, 16961]) Labels shape: torch.Size([1, 16961]) Final batch size: 1, sequence length: 18400 Attention mask shape: torch.Size([1, 1, 18400, 18400]) Position ids shape: torch.Size([1, 18400]) Input IDs shape: torch.Size([1, 18400]) Labels shape: torch.Size([1, 18400]) Final batch size: 1, sequence length: 16536 Attention mask shape: torch.Size([1, 1, 16536, 16536]) Position ids shape: torch.Size([1, 16536]) Input IDs shape: torch.Size([1, 16536]) Labels shape: torch.Size([1, 16536]) Final batch size: 1, sequence length: 19332 Attention mask shape: torch.Size([1, 1, 19332, 19332]) Position ids shape: torch.Size([1, 19332]) Input IDs shape: torch.Size([1, 19332]) Labels shape: torch.Size([1, 19332]) Final batch size: 1, sequence length: 16391 Attention mask shape: torch.Size([1, 1, 16391, 16391]) Position ids shape: torch.Size([1, 16391]) Input IDs shape: torch.Size([1, 16391]) Labels shape: torch.Size([1, 16391]) Final batch size: 1, sequence length: 16831 Attention mask shape: torch.Size([1, 1, 16831, 16831]) Position ids shape: torch.Size([1, 16831]) Input IDs shape: torch.Size([1, 16831]) Labels shape: torch.Size([1, 16831]) Final batch size: 1, sequence length: 21122 Attention mask shape: torch.Size([1, 1, 21122, 21122]) Position ids shape: torch.Size([1, 21122]) Input IDs shape: torch.Size([1, 21122]) Labels shape: torch.Size([1, 21122]) Final batch size: 1, sequence length: 18341 Attention mask shape: torch.Size([1, 1, 18341, 18341]) Position ids shape: torch.Size([1, 18341]) Input IDs shape: torch.Size([1, 18341]) Labels shape: torch.Size([1, 18341]) Final batch size: 1, sequence length: 18014 Attention mask shape: torch.Size([1, 1, 18014, 18014]) Position ids shape: torch.Size([1, 18014]) Input IDs shape: torch.Size([1, 18014]) Labels shape: torch.Size([1, 18014]) Final batch size: 1, sequence length: 19999 Attention mask shape: torch.Size([1, 1, 19999, 19999]) Position ids shape: torch.Size([1, 19999]) Input IDs shape: torch.Size([1, 19999]) Labels shape: torch.Size([1, 19999]) Final batch size: 1, sequence length: 20888 Attention mask shape: torch.Size([1, 1, 20888, 20888]) Position ids shape: torch.Size([1, 20888]) Input IDs shape: torch.Size([1, 20888]) Labels shape: torch.Size([1, 20888]) Final batch size: 1, sequence length: 20912 Attention mask shape: torch.Size([1, 1, 20912, 20912]) Position ids shape: torch.Size([1, 20912]) Input IDs shape: torch.Size([1, 20912]) Labels shape: torch.Size([1, 20912]) Final batch size: 1, sequence length: 21617 Attention mask shape: torch.Size([1, 1, 21617, 21617]) Position ids shape: torch.Size([1, 21617]) Input IDs shape: torch.Size([1, 21617]) Labels shape: torch.Size([1, 21617]) Final batch size: 1, sequence length: 20118 Attention mask shape: torch.Size([1, 1, 20118, 20118]) Position ids shape: torch.Size([1, 20118]) Input IDs shape: torch.Size([1, 20118]) Labels shape: torch.Size([1, 20118]) Final batch size: 1, sequence length: 19597 Attention mask shape: torch.Size([1, 1, 19597, 19597]) Position ids shape: torch.Size([1, 19597]) Input IDs shape: torch.Size([1, 19597]) Labels shape: torch.Size([1, 19597]) Final batch size: 1, sequence length: 13638 Attention mask shape: torch.Size([1, 1, 13638, 13638]) Position ids shape: torch.Size([1, 13638]) Input IDs shape: torch.Size([1, 13638]) Labels shape: torch.Size([1, 13638]) Final batch size: 1, sequence length: 18393 Attention mask shape: torch.Size([1, 1, 18393, 18393]) Position ids shape: torch.Size([1, 18393]) Input IDs shape: torch.Size([1, 18393]) Labels shape: torch.Size([1, 18393]) Final batch size: 1, sequence length: 18424 Attention mask shape: torch.Size([1, 1, 18424, 18424]) Position ids shape: torch.Size([1, 18424]) Input IDs shape: torch.Size([1, 18424]) Labels shape: torch.Size([1, 18424]) Final batch size: 1, sequence length: 21739 Attention mask shape: torch.Size([1, 1, 21739, 21739]) Position ids shape: torch.Size([1, 21739]) Input IDs shape: torch.Size([1, 21739]) Labels shape: torch.Size([1, 21739]) Final batch size: 1, sequence length: 22854 Attention mask shape: torch.Size([1, 1, 22854, 22854]) Position ids shape: torch.Size([1, 22854]) Input IDs shape: torch.Size([1, 22854]) Labels shape: torch.Size([1, 22854]) Final batch size: 1, sequence length: 21611 Attention mask shape: torch.Size([1, 1, 21611, 21611]) Position ids shape: torch.Size([1, 21611]) Input IDs shape: torch.Size([1, 21611]) Labels shape: torch.Size([1, 21611]) Final batch size: 1, sequence length: 12328 Attention mask shape: torch.Size([1, 1, 12328, 12328]) Position ids shape: torch.Size([1, 12328]) Input IDs shape: torch.Size([1, 12328]) Labels shape: torch.Size([1, 12328]) Final batch size: 1, sequence length: 21314 Attention mask shape: torch.Size([1, 1, 21314, 21314]) Position ids shape: torch.Size([1, 21314]) Input IDs shape: torch.Size([1, 21314]) Labels shape: torch.Size([1, 21314]) Final batch size: 1, sequence length: 24597 Attention mask shape: torch.Size([1, 1, 24597, 24597]) Position ids shape: torch.Size([1, 24597]) Input IDs shape: torch.Size([1, 24597]) Labels shape: torch.Size([1, 24597]) Final batch size: 1, sequence length: 6839 Attention mask shape: torch.Size([1, 1, 6839, 6839]) Position ids shape: torch.Size([1, 6839]) Input IDs shape: torch.Size([1, 6839]) Labels shape: torch.Size([1, 6839]) Final batch size: 1, sequence length: 9341 Attention mask shape: torch.Size([1, 1, 9341, 9341]) Position ids shape: torch.Size([1, 9341]) Input IDs shape: torch.Size([1, 9341]) Labels shape: torch.Size([1, 9341]) Final batch size: 1, sequence length: 26218 Attention mask shape: torch.Size([1, 1, 26218, 26218]) Position ids shape: torch.Size([1, 26218]) Input IDs shape: torch.Size([1, 26218]) Labels shape: torch.Size([1, 26218]) Final batch size: 1, sequence length: 23614 Attention mask shape: torch.Size([1, 1, 23614, 23614]) Position ids shape: torch.Size([1, 23614]) Input IDs shape: torch.Size([1, 23614]) Labels shape: torch.Size([1, 23614]) Final batch size: 1, sequence length: 25909 Attention mask shape: torch.Size([1, 1, 25909, 25909]) Position ids shape: torch.Size([1, 25909]) Input IDs shape: torch.Size([1, 25909]) Labels shape: torch.Size([1, 25909]) Final batch size: 1, sequence length: 19885 Attention mask shape: torch.Size([1, 1, 19885, 19885]) Position ids shape: torch.Size([1, 19885]) Input IDs shape: torch.Size([1, 19885]) Labels shape: torch.Size([1, 19885]) Final batch size: 1, sequence length: 15026 Attention mask shape: torch.Size([1, 1, 15026, 15026]) Position ids shape: torch.Size([1, 15026]) Input IDs shape: torch.Size([1, 15026]) Labels shape: torch.Size([1, 15026]) Final batch size: 1, sequence length: 24342 Attention mask shape: torch.Size([1, 1, 24342, 24342]) Position ids shape: torch.Size([1, 24342]) Input IDs shape: torch.Size([1, 24342]) Labels shape: torch.Size([1, 24342]) Final batch size: 1, sequence length: 25575 Attention mask shape: torch.Size([1, 1, 25575, 25575]) Position ids shape: torch.Size([1, 25575]) Input IDs shape: torch.Size([1, 25575]) Labels shape: torch.Size([1, 25575]) Final batch size: 1, sequence length: 24090 Attention mask shape: torch.Size([1, 1, 24090, 24090]) Position ids shape: torch.Size([1, 24090]) Input IDs shape: torch.Size([1, 24090]) Labels shape: torch.Size([1, 24090]) Final batch size: 1, sequence length: 17780 Attention mask shape: torch.Size([1, 1, 17780, 17780]) Position ids shape: torch.Size([1, 17780]) Input IDs shape: torch.Size([1, 17780]) Labels shape: torch.Size([1, 17780]) Final batch size: 1, sequence length: 15232 Attention mask shape: torch.Size([1, 1, 15232, 15232]) Position ids shape: torch.Size([1, 15232]) Input IDs shape: torch.Size([1, 15232]) Labels shape: torch.Size([1, 15232]) Final batch size: 1, sequence length: 25042 Attention mask shape: torch.Size([1, 1, 25042, 25042]) Position ids shape: torch.Size([1, 25042]) Input IDs shape: torch.Size([1, 25042]) Labels shape: torch.Size([1, 25042]) Final batch size: 1, sequence length: 21725 Attention mask shape: torch.Size([1, 1, 21725, 21725]) Position ids shape: torch.Size([1, 21725]) Input IDs shape: torch.Size([1, 21725]) Labels shape: torch.Size([1, 21725]) Final batch size: 1, sequence length: 28348 Attention mask shape: torch.Size([1, 1, 28348, 28348]) Position ids shape: torch.Size([1, 28348]) Input IDs shape: torch.Size([1, 28348]) Labels shape: torch.Size([1, 28348]) Final batch size: 1, sequence length: 21677 Attention mask shape: torch.Size([1, 1, 21677, 21677]) Position ids shape: torch.Size([1, 21677]) Input IDs shape: torch.Size([1, 21677]) Labels shape: torch.Size([1, 21677]) Final batch size: 1, sequence length: 30366 Attention mask shape: torch.Size([1, 1, 30366, 30366]) Position ids shape: torch.Size([1, 30366]) Input IDs shape: torch.Size([1, 30366]) Labels shape: torch.Size([1, 30366]) Final batch size: 1, sequence length: 16611 Attention mask shape: torch.Size([1, 1, 16611, 16611]) Position ids shape: torch.Size([1, 16611]) Input IDs shape: torch.Size([1, 16611]) Labels shape: torch.Size([1, 16611]) Final batch size: 1, sequence length: 20619 Attention mask shape: torch.Size([1, 1, 20619, 20619]) Position ids shape: torch.Size([1, 20619]) Input IDs shape: torch.Size([1, 20619]) Labels shape: torch.Size([1, 20619]) Final batch size: 1, sequence length: 22217 Attention mask shape: torch.Size([1, 1, 22217, 22217]) Position ids shape: torch.Size([1, 22217]) Input IDs shape: torch.Size([1, 22217]) Labels shape: torch.Size([1, 22217]) Final batch size: 1, sequence length: 29730 Attention mask shape: torch.Size([1, 1, 29730, 29730]) Position ids shape: torch.Size([1, 29730]) Input IDs shape: torch.Size([1, 29730]) Labels shape: torch.Size([1, 29730]) Final batch size: 1, sequence length: 13646 Attention mask shape: torch.Size([1, 1, 13646, 13646]) Position ids shape: torch.Size([1, 13646]) Input IDs shape: torch.Size([1, 13646]) Labels shape: torch.Size([1, 13646]) Final batch size: 1, sequence length: 30410 Attention mask shape: torch.Size([1, 1, 30410, 30410]) Position ids shape: torch.Size([1, 30410]) Input IDs shape: torch.Size([1, 30410]) Labels shape: torch.Size([1, 30410]) Final batch size: 1, sequence length: 27795 Attention mask shape: torch.Size([1, 1, 27795, 27795]) Position ids shape: torch.Size([1, 27795]) Input IDs shape: torch.Size([1, 27795]) Labels shape: torch.Size([1, 27795]) Final batch size: 1, sequence length: 30766 Attention mask shape: torch.Size([1, 1, 30766, 30766]) Position ids shape: torch.Size([1, 30766]) Input IDs shape: torch.Size([1, 30766]) Labels shape: torch.Size([1, 30766]) Final batch size: 1, sequence length: 20702 Attention mask shape: torch.Size([1, 1, 20702, 20702]) Position ids shape: torch.Size([1, 20702]) Input IDs shape: torch.Size([1, 20702]) Labels shape: torch.Size([1, 20702]) Final batch size: 1, sequence length: 28678 Attention mask shape: torch.Size([1, 1, 28678, 28678]) Position ids shape: torch.Size([1, 28678]) Input IDs shape: torch.Size([1, 28678]) Labels shape: torch.Size([1, 28678]) Final batch size: 1, sequence length: 21623 Attention mask shape: torch.Size([1, 1, 21623, 21623]) Position ids shape: torch.Size([1, 21623]) Input IDs shape: torch.Size([1, 21623]) Labels shape: torch.Size([1, 21623]) Final batch size: 1, sequence length: 20915 Attention mask shape: torch.Size([1, 1, 20915, 20915]) Position ids shape: torch.Size([1, 20915]) Input IDs shape: torch.Size([1, 20915]) Labels shape: torch.Size([1, 20915]) Final batch size: 1, sequence length: 30796 Attention mask shape: torch.Size([1, 1, 30796, 30796]) Position ids shape: torch.Size([1, 30796]) Input IDs shape: torch.Size([1, 30796]) Labels shape: torch.Size([1, 30796]) Final batch size: 1, sequence length: 32034 Attention mask shape: torch.Size([1, 1, 32034, 32034]) Position ids shape: torch.Size([1, 32034]) Input IDs shape: torch.Size([1, 32034]) Labels shape: torch.Size([1, 32034]) Final batch size: 1, sequence length: 30917 Attention mask shape: torch.Size([1, 1, 30917, 30917]) Position ids shape: torch.Size([1, 30917]) Input IDs shape: torch.Size([1, 30917]) Labels shape: torch.Size([1, 30917]) Final batch size: 1, sequence length: 35628 Attention mask shape: torch.Size([1, 1, 35628, 35628]) Position ids shape: torch.Size([1, 35628]) Input IDs shape: torch.Size([1, 35628]) Labels shape: torch.Size([1, 35628]) Final batch size: 1, sequence length: 14070 Attention mask shape: torch.Size([1, 1, 14070, 14070]) Position ids shape: torch.Size([1, 14070]) Input IDs shape: torch.Size([1, 14070]) Labels shape: torch.Size([1, 14070]) Final batch size: 1, sequence length: 16403 Attention mask shape: torch.Size([1, 1, 16403, 16403]) Position ids shape: torch.Size([1, 16403]) Input IDs shape: torch.Size([1, 16403]) Final batch size: 1, sequence length: 35781 Labels shape: torch.Size([1, 16403]) Attention mask shape: torch.Size([1, 1, 35781, 35781]) Position ids shape: torch.Size([1, 35781]) Input IDs shape: torch.Size([1, 35781]) Labels shape: torch.Size([1, 35781]) Final batch size: 1, sequence length: 25698 Attention mask shape: torch.Size([1, 1, 25698, 25698]) Position ids shape: torch.Size([1, 25698]) Input IDs shape: torch.Size([1, 25698]) Labels shape: torch.Size([1, 25698]) Final batch size: 1, sequence length: 20028 Attention mask shape: torch.Size([1, 1, 20028, 20028]) Position ids shape: torch.Size([1, 20028]) Input IDs shape: torch.Size([1, 20028]) Labels shape: torch.Size([1, 20028]) Final batch size: 1, sequence length: 32308 Attention mask shape: torch.Size([1, 1, 32308, 32308]) Position ids shape: torch.Size([1, 32308]) Input IDs shape: torch.Size([1, 32308]) Labels shape: torch.Size([1, 32308]) Final batch size: 1, sequence length: 32630 Attention mask shape: torch.Size([1, 1, 32630, 32630]) Position ids shape: torch.Size([1, 32630]) Input IDs shape: torch.Size([1, 32630]) Labels shape: torch.Size([1, 32630]) Final batch size: 1, sequence length: 35697 Attention mask shape: torch.Size([1, 1, 35697, 35697]) Position ids shape: torch.Size([1, 35697]) Input IDs shape: torch.Size([1, 35697]) Labels shape: torch.Size([1, 35697]) Final batch size: 1, sequence length: 23626 Attention mask shape: torch.Size([1, 1, 23626, 23626]) Position ids shape: torch.Size([1, 23626]) Input IDs shape: torch.Size([1, 23626]) Labels shape: torch.Size([1, 23626]) Final batch size: 1, sequence length: 38362 Attention mask shape: torch.Size([1, 1, 38362, 38362]) Position ids shape: torch.Size([1, 38362]) Input IDs shape: torch.Size([1, 38362]) Labels shape: torch.Size([1, 38362]) Final batch size: 1, sequence length: 21562 Attention mask shape: torch.Size([1, 1, 21562, 21562]) Position ids shape: torch.Size([1, 21562]) Input IDs shape: torch.Size([1, 21562]) Labels shape: torch.Size([1, 21562]) Final batch size: 1, sequence length: 32239 Attention mask shape: torch.Size([1, 1, 32239, 32239]) Position ids shape: torch.Size([1, 32239]) Input IDs shape: torch.Size([1, 32239]) Labels shape: torch.Size([1, 32239]) Final batch size: 1, sequence length: 37195 Attention mask shape: torch.Size([1, 1, 37195, 37195]) Position ids shape: torch.Size([1, 37195]) Input IDs shape: torch.Size([1, 37195]) Labels shape: torch.Size([1, 37195]) Final batch size: 1, sequence length: 29202 Attention mask shape: torch.Size([1, 1, 29202, 29202]) Position ids shape: torch.Size([1, 29202]) Input IDs shape: torch.Size([1, 29202]) Labels shape: torch.Size([1, 29202]) Final batch size: 1, sequence length: 38190 Attention mask shape: torch.Size([1, 1, 38190, 38190]) Position ids shape: torch.Size([1, 38190]) Input IDs shape: torch.Size([1, 38190]) Labels shape: torch.Size([1, 38190]) Final batch size: 1, sequence length: 11896 Attention mask shape: torch.Size([1, 1, 11896, 11896]) Position ids shape: torch.Size([1, 11896]) Input IDs shape: torch.Size([1, 11896]) Labels shape: torch.Size([1, 11896]) Final batch size: 1, sequence length: 38420 Attention mask shape: torch.Size([1, 1, 38420, 38420]) Position ids shape: torch.Size([1, 38420]) Input IDs shape: torch.Size([1, 38420]) Labels shape: torch.Size([1, 38420]) Final batch size: 1, sequence length: 39608 Attention mask shape: torch.Size([1, 1, 39608, 39608]) Position ids shape: torch.Size([1, 39608]) Input IDs shape: torch.Size([1, 39608]) Labels shape: torch.Size([1, 39608]) Final batch size: 1, sequence length: 15726 Attention mask shape: torch.Size([1, 1, 15726, 15726]) Position ids shape: torch.Size([1, 15726]) Input IDs shape: torch.Size([1, 15726]) Labels shape: torch.Size([1, 15726]) Final batch size: 1, sequence length: 22199 Attention mask shape: torch.Size([1, 1, 22199, 22199]) Position ids shape: torch.Size([1, 22199]) Input IDs shape: torch.Size([1, 22199]) Labels shape: torch.Size([1, 22199]) Final batch size: 1, sequence length: 34802 Attention mask shape: torch.Size([1, 1, 34802, 34802]) Position ids shape: torch.Size([1, 34802]) Input IDs shape: torch.Size([1, 34802]) Labels shape: torch.Size([1, 34802]) Final batch size: 1, sequence length: 22768 Attention mask shape: torch.Size([1, 1, 22768, 22768]) Position ids shape: torch.Size([1, 22768]) Input IDs shape: torch.Size([1, 22768]) Labels shape: torch.Size([1, 22768]) Final batch size: 1, sequence length: 14828 Attention mask shape: torch.Size([1, 1, 14828, 14828]) Position ids shape: torch.Size([1, 14828]) Input IDs shape: torch.Size([1, 14828]) Labels shape: torch.Size([1, 14828]) Final batch size: 1, sequence length: 18568 Attention mask shape: torch.Size([1, 1, 18568, 18568]) Position ids shape: torch.Size([1, 18568]) Input IDs shape: torch.Size([1, 18568]) Labels shape: torch.Size([1, 18568]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 26840 Attention mask shape: torch.Size([1, 1, 26840, 26840]) Position ids shape: torch.Size([1, 26840]) Input IDs shape: torch.Size([1, 26840]) Labels shape: torch.Size([1, 26840]) Final batch size: 1, sequence length: 31143 Attention mask shape: torch.Size([1, 1, 31143, 31143]) Position ids shape: torch.Size([1, 31143]) Input IDs shape: torch.Size([1, 31143]) Labels shape: torch.Size([1, 31143]) Final batch size: 1, sequence length: 10426 Attention mask shape: torch.Size([1, 1, 10426, 10426]) Position ids shape: torch.Size([1, 10426]) Input IDs shape: torch.Size([1, 10426]) Labels shape: torch.Size([1, 10426]) Final batch size: 1, sequence length: 26287 Attention mask shape: torch.Size([1, 1, 26287, 26287]) Position ids shape: torch.Size([1, 26287]) Input IDs shape: torch.Size([1, 26287]) Labels shape: torch.Size([1, 26287]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 30651 Attention mask shape: torch.Size([1, 1, 30651, 30651]) Position ids shape: torch.Size([1, 30651]) Input IDs shape: torch.Size([1, 30651]) Labels shape: torch.Size([1, 30651]) Final batch size: 1, sequence length: 16304 Attention mask shape: torch.Size([1, 1, 16304, 16304]) Position ids shape: torch.Size([1, 16304]) Input IDs shape: torch.Size([1, 16304]) Labels shape: torch.Size([1, 16304]) Final batch size: 1, sequence length: 40088 Attention mask shape: torch.Size([1, 1, 40088, 40088]) Position ids shape: torch.Size([1, 40088]) Input IDs shape: torch.Size([1, 40088]) Labels shape: torch.Size([1, 40088]) Final batch size: 1, sequence length: 34785 Attention mask shape: torch.Size([1, 1, 34785, 34785]) Position ids shape: torch.Size([1, 34785]) Input IDs shape: torch.Size([1, 34785]) Labels shape: torch.Size([1, 34785]) Final batch size: 1, sequence length: 32559 Attention mask shape: torch.Size([1, 1, 32559, 32559]) Position ids shape: torch.Size([1, 32559]) Input IDs shape: torch.Size([1, 32559]) Labels shape: torch.Size([1, 32559]) Final batch size: 1, sequence length: 35914 Attention mask shape: torch.Size([1, 1, 35914, 35914]) Position ids shape: torch.Size([1, 35914]) Input IDs shape: torch.Size([1, 35914]) Labels shape: torch.Size([1, 35914]) Final batch size: 1, sequence length: 37529 Attention mask shape: torch.Size([1, 1, 37529, 37529]) Position ids shape: torch.Size([1, 37529]) Input IDs shape: torch.Size([1, 37529]) Labels shape: torch.Size([1, 37529]) Final batch size: 1, sequence length: 27819 Attention mask shape: torch.Size([1, 1, 27819, 27819]) Position ids shape: torch.Size([1, 27819]) Input IDs shape: torch.Size([1, 27819]) Labels shape: torch.Size([1, 27819]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 10198 Attention mask shape: torch.Size([1, 1, 10198, 10198]) Position ids shape: torch.Size([1, 10198]) Input IDs shape: torch.Size([1, 10198]) Labels shape: torch.Size([1, 10198]) Final batch size: 1, sequence length: 18859 Attention mask shape: torch.Size([1, 1, 18859, 18859]) Position ids shape: torch.Size([1, 18859]) Input IDs shape: torch.Size([1, 18859]) Labels shape: torch.Size([1, 18859]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 36638 Attention mask shape: torch.Size([1, 1, 36638, 36638]) Position ids shape: torch.Size([1, 36638]) Input IDs shape: torch.Size([1, 36638]) Labels shape: torch.Size([1, 36638]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 19494 Attention mask shape: torch.Size([1, 1, 19494, 19494]) Position ids shape: torch.Size([1, 19494]) Input IDs shape: torch.Size([1, 19494]) Labels shape: torch.Size([1, 19494]) Final batch size: 1, sequence length: 40763 Attention mask shape: torch.Size([1, 1, 40763, 40763]) Position ids shape: torch.Size([1, 40763]) Input IDs shape: torch.Size([1, 40763]) Labels shape: torch.Size([1, 40763]) Final batch size: 1, sequence length: 31754 Attention mask shape: torch.Size([1, 1, 31754, 31754]) Position ids shape: torch.Size([1, 31754]) Input IDs shape: torch.Size([1, 31754]) Labels shape: torch.Size([1, 31754]) Final batch size: 1, sequence length: 30835 Attention mask shape: torch.Size([1, 1, 30835, 30835]) Position ids shape: torch.Size([1, 30835]) Input IDs shape: torch.Size([1, 30835]) Labels shape: torch.Size([1, 30835]) Final batch size: 1, sequence length: 31316 Attention mask shape: torch.Size([1, 1, 31316, 31316]) Position ids shape: torch.Size([1, 31316]) Input IDs shape: torch.Size([1, 31316]) Labels shape: torch.Size([1, 31316]) Final batch size: 1, sequence length: 23936 Attention mask shape: torch.Size([1, 1, 23936, 23936]) Position ids shape: torch.Size([1, 23936]) Input IDs shape: torch.Size([1, 23936]) Labels shape: torch.Size([1, 23936]) Final batch size: 1, sequence length: 12605 Attention mask shape: torch.Size([1, 1, 12605, 12605]) Position ids shape: torch.Size([1, 12605]) Input IDs shape: torch.Size([1, 12605]) Labels shape: torch.Size([1, 12605]) Final batch size: 1, sequence length: 26269 Attention mask shape: torch.Size([1, 1, 26269, 26269]) Position ids shape: torch.Size([1, 26269]) Input IDs shape: torch.Size([1, 26269]) Labels shape: torch.Size([1, 26269]) Final batch size: 1, sequence length: 25573 Attention mask shape: torch.Size([1, 1, 25573, 25573]) Position ids shape: torch.Size([1, 25573]) Input IDs shape: torch.Size([1, 25573]) Labels shape: torch.Size([1, 25573]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 37170 Attention mask shape: torch.Size([1, 1, 37170, 37170]) Position ids shape: torch.Size([1, 37170]) Input IDs shape: torch.Size([1, 37170]) Labels shape: torch.Size([1, 37170]) {'loss': 0.3584, 'grad_norm': 0.8542427194110321, 'learning_rate': 9.829629131445342e-06, 'num_tokens': -inf, 'epoch': 1.25} Final batch size: 1, sequence length: 7998 Attention mask shape: torch.Size([1, 1, 7998, 7998]) Position ids shape: torch.Size([1, 7998]) Input IDs shape: torch.Size([1, 7998]) Labels shape: torch.Size([1, 7998]) Final batch size: 1, sequence length: 6925 Attention mask shape: torch.Size([1, 1, 6925, 6925]) Position ids shape: torch.Size([1, 6925]) Input IDs shape: torch.Size([1, 6925]) Labels shape: torch.Size([1, 6925]) Final batch size: 1, sequence length: 7402 Attention mask shape: torch.Size([1, 1, 7402, 7402]) Position ids shape: torch.Size([1, 7402]) Input IDs shape: torch.Size([1, 7402]) Labels shape: torch.Size([1, 7402]) Final batch size: 1, sequence length: 10102 Attention mask shape: torch.Size([1, 1, 10102, 10102]) Position ids shape: torch.Size([1, 10102]) Input IDs shape: torch.Size([1, 10102]) Labels shape: torch.Size([1, 10102]) Final batch size: 1, sequence length: 9243 Attention mask shape: torch.Size([1, 1, 9243, 9243]) Position ids shape: torch.Size([1, 9243]) Input IDs shape: torch.Size([1, 9243]) Labels shape: torch.Size([1, 9243]) Final batch size: 1, sequence length: 9452 Attention mask shape: torch.Size([1, 1, 9452, 9452]) Position ids shape: torch.Size([1, 9452]) Input IDs shape: torch.Size([1, 9452]) Labels shape: torch.Size([1, 9452]) Final batch size: 1, sequence length: 10862 Attention mask shape: torch.Size([1, 1, 10862, 10862]) Position ids shape: torch.Size([1, 10862]) Input IDs shape: torch.Size([1, 10862]) Labels shape: torch.Size([1, 10862]) Final batch size: 1, sequence length: 10804 Attention mask shape: torch.Size([1, 1, 10804, 10804]) Position ids shape: torch.Size([1, 10804]) Input IDs shape: torch.Size([1, 10804]) Labels shape: torch.Size([1, 10804]) Final batch size: 1, sequence length: 11947 Attention mask shape: torch.Size([1, 1, 11947, 11947]) Position ids shape: torch.Size([1, 11947]) Input IDs shape: torch.Size([1, 11947]) Labels shape: torch.Size([1, 11947]) Final batch size: 1, sequence length: 12719 Attention mask shape: torch.Size([1, 1, 12719, 12719]) Position ids shape: torch.Size([1, 12719]) Input IDs shape: torch.Size([1, 12719]) Labels shape: torch.Size([1, 12719]) Final batch size: 1, sequence length: 11623 Attention mask shape: torch.Size([1, 1, 11623, 11623]) Position ids shape: torch.Size([1, 11623]) Input IDs shape: torch.Size([1, 11623]) Labels shape: torch.Size([1, 11623]) Final batch size: 1, sequence length: 15794 Attention mask shape: torch.Size([1, 1, 15794, 15794]) Position ids shape: torch.Size([1, 15794]) Input IDs shape: torch.Size([1, 15794]) Labels shape: torch.Size([1, 15794]) Final batch size: 1, sequence length: 8432 Attention mask shape: torch.Size([1, 1, 8432, 8432]) Position ids shape: torch.Size([1, 8432]) Input IDs shape: torch.Size([1, 8432]) Labels shape: torch.Size([1, 8432]) Final batch size: 1, sequence length: 15886 Attention mask shape: torch.Size([1, 1, 15886, 15886]) Position ids shape: torch.Size([1, 15886]) Input IDs shape: torch.Size([1, 15886]) Labels shape: torch.Size([1, 15886]) Final batch size: 1, sequence length: 16060 Attention mask shape: torch.Size([1, 1, 16060, 16060]) Position ids shape: torch.Size([1, 16060]) Input IDs shape: torch.Size([1, 16060]) Labels shape: torch.Size([1, 16060]) Final batch size: 1, sequence length: 13209 Attention mask shape: torch.Size([1, 1, 13209, 13209]) Position ids shape: torch.Size([1, 13209]) Input IDs shape: torch.Size([1, 13209]) Labels shape: torch.Size([1, 13209]) Final batch size: 1, sequence length: 15053 Attention mask shape: torch.Size([1, 1, 15053, 15053]) Position ids shape: torch.Size([1, 15053]) Input IDs shape: torch.Size([1, 15053]) Labels shape: torch.Size([1, 15053]) Final batch size: 1, sequence length: 17376 Attention mask shape: torch.Size([1, 1, 17376, 17376]) Position ids shape: torch.Size([1, 17376]) Input IDs shape: torch.Size([1, 17376]) Labels shape: torch.Size([1, 17376]) Final batch size: 1, sequence length: 17918 Attention mask shape: torch.Size([1, 1, 17918, 17918]) Position ids shape: torch.Size([1, 17918]) Input IDs shape: torch.Size([1, 17918]) Labels shape: torch.Size([1, 17918]) Final batch size: 1, sequence length: 17370 Attention mask shape: torch.Size([1, 1, 17370, 17370]) Position ids shape: torch.Size([1, 17370]) Input IDs shape: torch.Size([1, 17370]) Labels shape: torch.Size([1, 17370]) Final batch size: 1, sequence length: 16536 Attention mask shape: torch.Size([1, 1, 16536, 16536]) Position ids shape: torch.Size([1, 16536]) Input IDs shape: torch.Size([1, 16536]) Labels shape: torch.Size([1, 16536]) Final batch size: 1, sequence length: 19935 Attention mask shape: torch.Size([1, 1, 19935, 19935]) Position ids shape: torch.Size([1, 19935]) Input IDs shape: torch.Size([1, 19935]) Labels shape: torch.Size([1, 19935]) Final batch size: 1, sequence length: 19609 Attention mask shape: torch.Size([1, 1, 19609, 19609]) Position ids shape: torch.Size([1, 19609]) Input IDs shape: torch.Size([1, 19609]) Labels shape: torch.Size([1, 19609]) Final batch size: 1, sequence length: 17767 Attention mask shape: torch.Size([1, 1, 17767, 17767]) Position ids shape: torch.Size([1, 17767]) Input IDs shape: torch.Size([1, 17767]) Labels shape: torch.Size([1, 17767]) Final batch size: 1, sequence length: 19629 Attention mask shape: torch.Size([1, 1, 19629, 19629]) Position ids shape: torch.Size([1, 19629]) Input IDs shape: torch.Size([1, 19629]) Labels shape: torch.Size([1, 19629]) Final batch size: 1, sequence length: 21117 Attention mask shape: torch.Size([1, 1, 21117, 21117]) Position ids shape: torch.Size([1, 21117]) Input IDs shape: torch.Size([1, 21117]) Labels shape: torch.Size([1, 21117]) Final batch size: 1, sequence length: 17625 Attention mask shape: torch.Size([1, 1, 17625, 17625]) Position ids shape: torch.Size([1, 17625]) Input IDs shape: torch.Size([1, 17625]) Labels shape: torch.Size([1, 17625]) Final batch size: 1, sequence length: 20056 Attention mask shape: torch.Size([1, 1, 20056, 20056]) Position ids shape: torch.Size([1, 20056]) Input IDs shape: torch.Size([1, 20056]) Labels shape: torch.Size([1, 20056]) Final batch size: 1, sequence length: 20554 Attention mask shape: torch.Size([1, 1, 20554, 20554]) Position ids shape: torch.Size([1, 20554]) Input IDs shape: torch.Size([1, 20554]) Labels shape: torch.Size([1, 20554]) Final batch size: 1, sequence length: 20185 Attention mask shape: torch.Size([1, 1, 20185, 20185]) Position ids shape: torch.Size([1, 20185]) Input IDs shape: torch.Size([1, 20185]) Labels shape: torch.Size([1, 20185]) Final batch size: 1, sequence length: 17665 Attention mask shape: torch.Size([1, 1, 17665, 17665]) Position ids shape: torch.Size([1, 17665]) Input IDs shape: torch.Size([1, 17665]) Labels shape: torch.Size([1, 17665]) Final batch size: 1, sequence length: 21980 Attention mask shape: torch.Size([1, 1, 21980, 21980]) Position ids shape: torch.Size([1, 21980]) Input IDs shape: torch.Size([1, 21980]) Labels shape: torch.Size([1, 21980]) Final batch size: 1, sequence length: 8498 Attention mask shape: torch.Size([1, 1, 8498, 8498]) Position ids shape: torch.Size([1, 8498]) Input IDs shape: torch.Size([1, 8498]) Labels shape: torch.Size([1, 8498]) Final batch size: 1, sequence length: 12006 Attention mask shape: torch.Size([1, 1, 12006, 12006]) Position ids shape: torch.Size([1, 12006]) Input IDs shape: torch.Size([1, 12006]) Labels shape: torch.Size([1, 12006]) Final batch size: 1, sequence length: 22797 Attention mask shape: torch.Size([1, 1, 22797, 22797]) Position ids shape: torch.Size([1, 22797]) Input IDs shape: torch.Size([1, 22797]) Labels shape: torch.Size([1, 22797]) Final batch size: 1, sequence length: 21771 Attention mask shape: torch.Size([1, 1, 21771, 21771]) Position ids shape: torch.Size([1, 21771]) Input IDs shape: torch.Size([1, 21771]) Labels shape: torch.Size([1, 21771]) Final batch size: 1, sequence length: 22683 Attention mask shape: torch.Size([1, 1, 22683, 22683]) Position ids shape: torch.Size([1, 22683]) Input IDs shape: torch.Size([1, 22683]) Labels shape: torch.Size([1, 22683]) Final batch size: 1, sequence length: 21421 Attention mask shape: torch.Size([1, 1, 21421, 21421]) Position ids shape: torch.Size([1, 21421]) Input IDs shape: torch.Size([1, 21421]) Labels shape: torch.Size([1, 21421]) Final batch size: 1, sequence length: 16439 Attention mask shape: torch.Size([1, 1, 16439, 16439]) Position ids shape: torch.Size([1, 16439]) Input IDs shape: torch.Size([1, 16439]) Labels shape: torch.Size([1, 16439]) Final batch size: 1, sequence length: 20524 Attention mask shape: torch.Size([1, 1, 20524, 20524]) Position ids shape: torch.Size([1, 20524]) Input IDs shape: torch.Size([1, 20524]) Labels shape: torch.Size([1, 20524]) Final batch size: 1, sequence length: 22208 Attention mask shape: torch.Size([1, 1, 22208, 22208]) Position ids shape: torch.Size([1, 22208]) Input IDs shape: torch.Size([1, 22208]) Labels shape: torch.Size([1, 22208]) Final batch size: 1, sequence length: 16141 Attention mask shape: torch.Size([1, 1, 16141, 16141]) Position ids shape: torch.Size([1, 16141]) Input IDs shape: torch.Size([1, 16141]) Labels shape: torch.Size([1, 16141]) Final batch size: 1, sequence length: 24308 Attention mask shape: torch.Size([1, 1, 24308, 24308]) Position ids shape: torch.Size([1, 24308]) Input IDs shape: torch.Size([1, 24308]) Labels shape: torch.Size([1, 24308]) Final batch size: 1, sequence length: 18991 Attention mask shape: torch.Size([1, 1, 18991, 18991]) Position ids shape: torch.Size([1, 18991]) Input IDs shape: torch.Size([1, 18991]) Labels shape: torch.Size([1, 18991]) Final batch size: 1, sequence length: 24515 Attention mask shape: torch.Size([1, 1, 24515, 24515]) Position ids shape: torch.Size([1, 24515]) Input IDs shape: torch.Size([1, 24515]) Labels shape: torch.Size([1, 24515]) Final batch size: 1, sequence length: 20816 Attention mask shape: torch.Size([1, 1, 20816, 20816]) Position ids shape: torch.Size([1, 20816]) Input IDs shape: torch.Size([1, 20816]) Labels shape: torch.Size([1, 20816]) Final batch size: 1, sequence length: 26142 Attention mask shape: torch.Size([1, 1, 26142, 26142]) Position ids shape: torch.Size([1, 26142]) Input IDs shape: torch.Size([1, 26142]) Labels shape: torch.Size([1, 26142]) Final batch size: 1, sequence length: 27179 Attention mask shape: torch.Size([1, 1, 27179, 27179]) Position ids shape: torch.Size([1, 27179]) Input IDs shape: torch.Size([1, 27179]) Labels shape: torch.Size([1, 27179]) Final batch size: 1, sequence length: 26449 Attention mask shape: torch.Size([1, 1, 26449, 26449]) Position ids shape: torch.Size([1, 26449]) Input IDs shape: torch.Size([1, 26449]) Labels shape: torch.Size([1, 26449]) Final batch size: 1, sequence length: 10857 Attention mask shape: torch.Size([1, 1, 10857, 10857]) Position ids shape: torch.Size([1, 10857]) Input IDs shape: torch.Size([1, 10857]) Labels shape: torch.Size([1, 10857]) Final batch size: 1, sequence length: 26121 Attention mask shape: torch.Size([1, 1, 26121, 26121]) Position ids shape: torch.Size([1, 26121]) Input IDs shape: torch.Size([1, 26121]) Labels shape: torch.Size([1, 26121]) Final batch size: 1, sequence length: 24433 Attention mask shape: torch.Size([1, 1, 24433, 24433]) Position ids shape: torch.Size([1, 24433]) Input IDs shape: torch.Size([1, 24433]) Labels shape: torch.Size([1, 24433]) Final batch size: 1, sequence length: 29628 Attention mask shape: torch.Size([1, 1, 29628, 29628]) Position ids shape: torch.Size([1, 29628]) Input IDs shape: torch.Size([1, 29628]) Labels shape: torch.Size([1, 29628]) Final batch size: 1, sequence length: 18122 Attention mask shape: torch.Size([1, 1, 18122, 18122]) Position ids shape: torch.Size([1, 18122]) Input IDs shape: torch.Size([1, 18122]) Labels shape: torch.Size([1, 18122]) Final batch size: 1, sequence length: 16564 Attention mask shape: torch.Size([1, 1, 16564, 16564]) Position ids shape: torch.Size([1, 16564]) Input IDs shape: torch.Size([1, 16564]) Labels shape: torch.Size([1, 16564]) Final batch size: 1, sequence length: 28164 Attention mask shape: torch.Size([1, 1, 28164, 28164]) Position ids shape: torch.Size([1, 28164]) Input IDs shape: torch.Size([1, 28164]) Labels shape: torch.Size([1, 28164]) Final batch size: 1, sequence length: 21152 Attention mask shape: torch.Size([1, 1, 21152, 21152]) Position ids shape: torch.Size([1, 21152]) Input IDs shape: torch.Size([1, 21152]) Labels shape: torch.Size([1, 21152]) Final batch size: 1, sequence length: 30197 Attention mask shape: torch.Size([1, 1, 30197, 30197]) Position ids shape: torch.Size([1, 30197]) Input IDs shape: torch.Size([1, 30197]) Labels shape: torch.Size([1, 30197]) Final batch size: 1, sequence length: 32466 Attention mask shape: torch.Size([1, 1, 32466, 32466]) Position ids shape: torch.Size([1, 32466]) Input IDs shape: torch.Size([1, 32466]) Labels shape: torch.Size([1, 32466]) Final batch size: 1, sequence length: 16837 Attention mask shape: torch.Size([1, 1, 16837, 16837]) Position ids shape: torch.Size([1, 16837]) Input IDs shape: torch.Size([1, 16837]) Labels shape: torch.Size([1, 16837]) Final batch size: 1, sequence length: 24802 Attention mask shape: torch.Size([1, 1, 24802, 24802]) Position ids shape: torch.Size([1, 24802]) Input IDs shape: torch.Size([1, 24802]) Labels shape: torch.Size([1, 24802]) Final batch size: 1, sequence length: 17363 Attention mask shape: torch.Size([1, 1, 17363, 17363]) Position ids shape: torch.Size([1, 17363]) Input IDs shape: torch.Size([1, 17363]) Labels shape: torch.Size([1, 17363]) Final batch size: 1, sequence length: 31507 Attention mask shape: torch.Size([1, 1, 31507, 31507]) Position ids shape: torch.Size([1, 31507]) Input IDs shape: torch.Size([1, 31507]) Labels shape: torch.Size([1, 31507]) Final batch size: 1, sequence length: 32752 Attention mask shape: torch.Size([1, 1, 32752, 32752]) Position ids shape: torch.Size([1, 32752]) Input IDs shape: torch.Size([1, 32752]) Labels shape: torch.Size([1, 32752]) Final batch size: 1, sequence length: 31339 Attention mask shape: torch.Size([1, 1, 31339, 31339]) Position ids shape: torch.Size([1, 31339]) Input IDs shape: torch.Size([1, 31339]) Labels shape: torch.Size([1, 31339]) Final batch size: 1, sequence length: 33894 Attention mask shape: torch.Size([1, 1, 33894, 33894]) Position ids shape: torch.Size([1, 33894]) Input IDs shape: torch.Size([1, 33894]) Labels shape: torch.Size([1, 33894]) Final batch size: 1, sequence length: 24527 Attention mask shape: torch.Size([1, 1, 24527, 24527]) Position ids shape: torch.Size([1, 24527]) Input IDs shape: torch.Size([1, 24527]) Labels shape: torch.Size([1, 24527]) Final batch size: 1, sequence length: 29392 Attention mask shape: torch.Size([1, 1, 29392, 29392]) Position ids shape: torch.Size([1, 29392]) Input IDs shape: torch.Size([1, 29392]) Labels shape: torch.Size([1, 29392]) Final batch size: 1, sequence length: 27061 Attention mask shape: torch.Size([1, 1, 27061, 27061]) Position ids shape: torch.Size([1, 27061]) Input IDs shape: torch.Size([1, 27061]) Labels shape: torch.Size([1, 27061]) Final batch size: 1, sequence length: 22762 Attention mask shape: torch.Size([1, 1, 22762, 22762]) Position ids shape: torch.Size([1, 22762]) Input IDs shape: torch.Size([1, 22762]) Labels shape: torch.Size([1, 22762]) Final batch size: 1, sequence length: 30181 Attention mask shape: torch.Size([1, 1, 30181, 30181]) Position ids shape: torch.Size([1, 30181]) Input IDs shape: torch.Size([1, 30181]) Labels shape: torch.Size([1, 30181]) Final batch size: 1, sequence length: 33794 Attention mask shape: torch.Size([1, 1, 33794, 33794]) Position ids shape: torch.Size([1, 33794]) Input IDs shape: torch.Size([1, 33794]) Labels shape: torch.Size([1, 33794]) Final batch size: 1, sequence length: 31525 Attention mask shape: torch.Size([1, 1, 31525, 31525]) Position ids shape: torch.Size([1, 31525]) Input IDs shape: torch.Size([1, 31525]) Labels shape: torch.Size([1, 31525]) Final batch size: 1, sequence length: 11107 Attention mask shape: torch.Size([1, 1, 11107, 11107]) Position ids shape: torch.Size([1, 11107]) Input IDs shape: torch.Size([1, 11107]) Labels shape: torch.Size([1, 11107]) Final batch size: 1, sequence length: 21143 Attention mask shape: torch.Size([1, 1, 21143, 21143]) Position ids shape: torch.Size([1, 21143]) Input IDs shape: torch.Size([1, 21143]) Labels shape: torch.Size([1, 21143]) Final batch size: 1, sequence length: 20274 Attention mask shape: torch.Size([1, 1, 20274, 20274]) Position ids shape: torch.Size([1, 20274]) Input IDs shape: torch.Size([1, 20274]) Labels shape: torch.Size([1, 20274]) Final batch size: 1, sequence length: 34711 Attention mask shape: torch.Size([1, 1, 34711, 34711]) Position ids shape: torch.Size([1, 34711]) Input IDs shape: torch.Size([1, 34711]) Labels shape: torch.Size([1, 34711]) Final batch size: 1, sequence length: 32891 Attention mask shape: torch.Size([1, 1, 32891, 32891]) Position ids shape: torch.Size([1, 32891]) Input IDs shape: torch.Size([1, 32891]) Labels shape: torch.Size([1, 32891]) Final batch size: 1, sequence length: 34512 Attention mask shape: torch.Size([1, 1, 34512, 34512]) Position ids shape: torch.Size([1, 34512]) Input IDs shape: torch.Size([1, 34512]) Labels shape: torch.Size([1, 34512]) Final batch size: 1, sequence length: 17181 Attention mask shape: torch.Size([1, 1, 17181, 17181]) Position ids shape: torch.Size([1, 17181]) Input IDs shape: torch.Size([1, 17181]) Labels shape: torch.Size([1, 17181]) Final batch size: 1, sequence length: 37343 Attention mask shape: torch.Size([1, 1, 37343, 37343]) Position ids shape: torch.Size([1, 37343]) Input IDs shape: torch.Size([1, 37343]) Labels shape: torch.Size([1, 37343]) Final batch size: 1, sequence length: 13942 Attention mask shape: torch.Size([1, 1, 13942, 13942]) Position ids shape: torch.Size([1, 13942]) Input IDs shape: torch.Size([1, 13942]) Labels shape: torch.Size([1, 13942]) Final batch size: 1, sequence length: 34405 Attention mask shape: torch.Size([1, 1, 34405, 34405]) Position ids shape: torch.Size([1, 34405]) Input IDs shape: torch.Size([1, 34405]) Labels shape: torch.Size([1, 34405]) Final batch size: 1, sequence length: 18229 Attention mask shape: torch.Size([1, 1, 18229, 18229]) Position ids shape: torch.Size([1, 18229]) Input IDs shape: torch.Size([1, 18229]) Labels shape: torch.Size([1, 18229]) Final batch size: 1, sequence length: 31561 Attention mask shape: torch.Size([1, 1, 31561, 31561]) Position ids shape: torch.Size([1, 31561]) Input IDs shape: torch.Size([1, 31561]) Labels shape: torch.Size([1, 31561]) Final batch size: 1, sequence length: 33087 Attention mask shape: torch.Size([1, 1, 33087, 33087]) Position ids shape: torch.Size([1, 33087]) Input IDs shape: torch.Size([1, 33087]) Labels shape: torch.Size([1, 33087]) Final batch size: 1, sequence length: 16716 Attention mask shape: torch.Size([1, 1, 16716, 16716]) Position ids shape: torch.Size([1, 16716]) Input IDs shape: torch.Size([1, 16716]) Labels shape: torch.Size([1, 16716]) Final batch size: 1, sequence length: 31875 Attention mask shape: torch.Size([1, 1, 31875, 31875]) Position ids shape: torch.Size([1, 31875]) Input IDs shape: torch.Size([1, 31875]) Labels shape: torch.Size([1, 31875]) Final batch size: 1, sequence length: 37555 Attention mask shape: torch.Size([1, 1, 37555, 37555]) Position ids shape: torch.Size([1, 37555]) Input IDs shape: torch.Size([1, 37555]) Labels shape: torch.Size([1, 37555]) Final batch size: 1, sequence length: 35523 Attention mask shape: torch.Size([1, 1, 35523, 35523]) Position ids shape: torch.Size([1, 35523]) Input IDs shape: torch.Size([1, 35523]) Labels shape: torch.Size([1, 35523]) Final batch size: 1, sequence length: 38010 Attention mask shape: torch.Size([1, 1, 38010, 38010]) Position ids shape: torch.Size([1, 38010]) Input IDs shape: torch.Size([1, 38010]) Labels shape: torch.Size([1, 38010]) Final batch size: 1, sequence length: 29216 Attention mask shape: torch.Size([1, 1, 29216, 29216]) Position ids shape: torch.Size([1, 29216]) Input IDs shape: torch.Size([1, 29216]) Labels shape: torch.Size([1, 29216]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 33367 Attention mask shape: torch.Size([1, 1, 33367, 33367]) Position ids shape: torch.Size([1, 33367]) Input IDs shape: torch.Size([1, 33367]) Labels shape: torch.Size([1, 33367]) Final batch size: 1, sequence length: 13978 Attention mask shape: torch.Size([1, 1, 13978, 13978]) Position ids shape: torch.Size([1, 13978]) Input IDs shape: torch.Size([1, 13978]) Labels shape: torch.Size([1, 13978]) Final batch size: 1, sequence length: 31518 Attention mask shape: torch.Size([1, 1, 31518, 31518]) Position ids shape: torch.Size([1, 31518]) Input IDs shape: torch.Size([1, 31518]) Labels shape: torch.Size([1, 31518]) Final batch size: 1, sequence length: 16433 Attention mask shape: torch.Size([1, 1, 16433, 16433]) Position ids shape: torch.Size([1, 16433]) Input IDs shape: torch.Size([1, 16433]) Labels shape: torch.Size([1, 16433]) Final batch size: 1, sequence length: 40910 Attention mask shape: torch.Size([1, 1, 40910, 40910]) Position ids shape: torch.Size([1, 40910]) Input IDs shape: torch.Size([1, 40910]) Labels shape: torch.Size([1, 40910]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 36450 Attention mask shape: torch.Size([1, 1, 36450, 36450]) Position ids shape: torch.Size([1, 36450]) Input IDs shape: torch.Size([1, 36450]) Labels shape: torch.Size([1, 36450]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 36464 Attention mask shape: torch.Size([1, 1, 36464, 36464]) Position ids shape: torch.Size([1, 36464]) Input IDs shape: torch.Size([1, 36464]) Labels shape: torch.Size([1, 36464]) Final batch size: 1, sequence length: 37349 Attention mask shape: torch.Size([1, 1, 37349, 37349]) Position ids shape: torch.Size([1, 37349]) Input IDs shape: torch.Size([1, 37349]) Labels shape: torch.Size([1, 37349]) Final batch size: 1, sequence length: 29042 Attention mask shape: torch.Size([1, 1, 29042, 29042]) Position ids shape: torch.Size([1, 29042]) Input IDs shape: torch.Size([1, 29042]) Labels shape: torch.Size([1, 29042]) Final batch size: 1, sequence length: 21859 Attention mask shape: torch.Size([1, 1, 21859, 21859]) Position ids shape: torch.Size([1, 21859]) Input IDs shape: torch.Size([1, 21859]) Labels shape: torch.Size([1, 21859]) Final batch size: 1, sequence length: 30134 Attention mask shape: torch.Size([1, 1, 30134, 30134]) Position ids shape: torch.Size([1, 30134]) Input IDs shape: torch.Size([1, 30134]) Labels shape: torch.Size([1, 30134]) Final batch size: 1, sequence length: 19287 Attention mask shape: torch.Size([1, 1, 19287, 19287]) Position ids shape: torch.Size([1, 19287]) Input IDs shape: torch.Size([1, 19287]) Labels shape: torch.Size([1, 19287]) Final batch size: 1, sequence length: 26922 Attention mask shape: torch.Size([1, 1, 26922, 26922]) Position ids shape: torch.Size([1, 26922]) Input IDs shape: torch.Size([1, 26922]) Labels shape: torch.Size([1, 26922]) Final batch size: 1, sequence length: 31449 Attention mask shape: torch.Size([1, 1, 31449, 31449]) Position ids shape: torch.Size([1, 31449]) Input IDs shape: torch.Size([1, 31449]) Labels shape: torch.Size([1, 31449]) Final batch size: 1, sequence length: 17758 Attention mask shape: torch.Size([1, 1, 17758, 17758]) Position ids shape: torch.Size([1, 17758]) Input IDs shape: torch.Size([1, 17758]) Labels shape: torch.Size([1, 17758]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40752 Attention mask shape: torch.Size([1, 1, 40752, 40752]) Position ids shape: torch.Size([1, 40752]) Input IDs shape: torch.Size([1, 40752]) Labels shape: torch.Size([1, 40752]) Final batch size: 1, sequence length: 39258 Attention mask shape: torch.Size([1, 1, 39258, 39258]) Position ids shape: torch.Size([1, 39258]) Input IDs shape: torch.Size([1, 39258]) Labels shape: torch.Size([1, 39258]) Final batch size: 1, sequence length: 38529 Attention mask shape: torch.Size([1, 1, 38529, 38529]) Position ids shape: torch.Size([1, 38529]) Input IDs shape: torch.Size([1, 38529]) Labels shape: torch.Size([1, 38529]) Final batch size: 1, sequence length: 39169 Attention mask shape: torch.Size([1, 1, 39169, 39169]) Position ids shape: torch.Size([1, 39169]) Input IDs shape: torch.Size([1, 39169]) Labels shape: torch.Size([1, 39169]) Final batch size: 1, sequence length: 17535 Attention mask shape: torch.Size([1, 1, 17535, 17535]) Position ids shape: torch.Size([1, 17535]) Input IDs shape: torch.Size([1, 17535]) Labels shape: torch.Size([1, 17535]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 26893 Attention mask shape: torch.Size([1, 1, 26893, 26893]) Position ids shape: torch.Size([1, 26893]) Input IDs shape: torch.Size([1, 26893]) Labels shape: torch.Size([1, 26893]) Final batch size: 1, sequence length: 17890 Attention mask shape: torch.Size([1, 1, 17890, 17890]) Position ids shape: torch.Size([1, 17890]) Input IDs shape: torch.Size([1, 17890]) Labels shape: torch.Size([1, 17890]) Final batch size: 1, sequence length: 38891 Attention mask shape: torch.Size([1, 1, 38891, 38891]) Position ids shape: torch.Size([1, 38891]) Input IDs shape: torch.Size([1, 38891]) Labels shape: torch.Size([1, 38891]) Final batch size: 1, sequence length: 32638 Attention mask shape: torch.Size([1, 1, 32638, 32638]) Position ids shape: torch.Size([1, 32638]) Input IDs shape: torch.Size([1, 32638]) Labels shape: torch.Size([1, 32638]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 32465 Attention mask shape: torch.Size([1, 1, 32465, 32465]) Position ids shape: torch.Size([1, 32465]) Input IDs shape: torch.Size([1, 32465]) Labels shape: torch.Size([1, 32465]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 18471 Attention mask shape: torch.Size([1, 1, 18471, 18471]) Position ids shape: torch.Size([1, 18471]) Input IDs shape: torch.Size([1, 18471]) Labels shape: torch.Size([1, 18471]) Final batch size: 1, sequence length: 36599 Attention mask shape: torch.Size([1, 1, 36599, 36599]) Position ids shape: torch.Size([1, 36599]) Input IDs shape: torch.Size([1, 36599]) Labels shape: torch.Size([1, 36599]) Final batch size: 1, sequence length: 39953 Attention mask shape: torch.Size([1, 1, 39953, 39953]) Position ids shape: torch.Size([1, 39953]) Input IDs shape: torch.Size([1, 39953]) Labels shape: torch.Size([1, 39953]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) {'loss': 0.3418, 'grad_norm': 0.7526366562730208, 'learning_rate': 9.755282581475769e-06, 'num_tokens': -inf, 'epoch': 1.38} Final batch size: 1, sequence length: 7977 Attention mask shape: torch.Size([1, 1, 7977, 7977]) Position ids shape: torch.Size([1, 7977]) Input IDs shape: torch.Size([1, 7977]) Labels shape: torch.Size([1, 7977]) Final batch size: 1, sequence length: 6215 Attention mask shape: torch.Size([1, 1, 6215, 6215]) Position ids shape: torch.Size([1, 6215]) Input IDs shape: torch.Size([1, 6215]) Labels shape: torch.Size([1, 6215]) Final batch size: 1, sequence length: 10107 Attention mask shape: torch.Size([1, 1, 10107, 10107]) Position ids shape: torch.Size([1, 10107]) Input IDs shape: torch.Size([1, 10107]) Labels shape: torch.Size([1, 10107]) Final batch size: 1, sequence length: 6095 Attention mask shape: torch.Size([1, 1, 6095, 6095]) Position ids shape: torch.Size([1, 6095]) Input IDs shape: torch.Size([1, 6095]) Labels shape: torch.Size([1, 6095]) Final batch size: 1, sequence length: 8500 Attention mask shape: torch.Size([1, 1, 8500, 8500]) Position ids shape: torch.Size([1, 8500]) Input IDs shape: torch.Size([1, 8500]) Labels shape: torch.Size([1, 8500]) Final batch size: 1, sequence length: 10080 Attention mask shape: torch.Size([1, 1, 10080, 10080]) Position ids shape: torch.Size([1, 10080]) Input IDs shape: torch.Size([1, 10080]) Labels shape: torch.Size([1, 10080]) Final batch size: 1, sequence length: 12454 Attention mask shape: torch.Size([1, 1, 12454, 12454]) Position ids shape: torch.Size([1, 12454]) Input IDs shape: torch.Size([1, 12454]) Labels shape: torch.Size([1, 12454]) Final batch size: 1, sequence length: 5818 Attention mask shape: torch.Size([1, 1, 5818, 5818]) Position ids shape: torch.Size([1, 5818]) Input IDs shape: torch.Size([1, 5818]) Labels shape: torch.Size([1, 5818]) Final batch size: 1, sequence length: 12928 Attention mask shape: torch.Size([1, 1, 12928, 12928]) Position ids shape: torch.Size([1, 12928]) Input IDs shape: torch.Size([1, 12928]) Labels shape: torch.Size([1, 12928]) Final batch size: 1, sequence length: 10505 Attention mask shape: torch.Size([1, 1, 10505, 10505]) Position ids shape: torch.Size([1, 10505]) Input IDs shape: torch.Size([1, 10505]) Labels shape: torch.Size([1, 10505]) Final batch size: 1, sequence length: 13624 Attention mask shape: torch.Size([1, 1, 13624, 13624]) Position ids shape: torch.Size([1, 13624]) Input IDs shape: torch.Size([1, 13624]) Labels shape: torch.Size([1, 13624]) Final batch size: 1, sequence length: 13092 Attention mask shape: torch.Size([1, 1, 13092, 13092]) Position ids shape: torch.Size([1, 13092]) Input IDs shape: torch.Size([1, 13092]) Labels shape: torch.Size([1, 13092]) Final batch size: 1, sequence length: 13459 Attention mask shape: torch.Size([1, 1, 13459, 13459]) Position ids shape: torch.Size([1, 13459]) Input IDs shape: torch.Size([1, 13459]) Labels shape: torch.Size([1, 13459]) Final batch size: 1, sequence length: 12556 Attention mask shape: torch.Size([1, 1, 12556, 12556]) Position ids shape: torch.Size([1, 12556]) Input IDs shape: torch.Size([1, 12556]) Labels shape: torch.Size([1, 12556]) Final batch size: 1, sequence length: 9217 Attention mask shape: torch.Size([1, 1, 9217, 9217]) Position ids shape: torch.Size([1, 9217]) Input IDs shape: torch.Size([1, 9217]) Labels shape: torch.Size([1, 9217]) Final batch size: 1, sequence length: 10687 Attention mask shape: torch.Size([1, 1, 10687, 10687]) Position ids shape: torch.Size([1, 10687]) Input IDs shape: torch.Size([1, 10687]) Labels shape: torch.Size([1, 10687]) Final batch size: 1, sequence length: 12826 Attention mask shape: torch.Size([1, 1, 12826, 12826]) Position ids shape: torch.Size([1, 12826]) Input IDs shape: torch.Size([1, 12826]) Labels shape: torch.Size([1, 12826]) Final batch size: 1, sequence length: 9379 Attention mask shape: torch.Size([1, 1, 9379, 9379]) Position ids shape: torch.Size([1, 9379]) Input IDs shape: torch.Size([1, 9379]) Labels shape: torch.Size([1, 9379]) Final batch size: 1, sequence length: 16782 Attention mask shape: torch.Size([1, 1, 16782, 16782]) Position ids shape: torch.Size([1, 16782]) Input IDs shape: torch.Size([1, 16782]) Labels shape: torch.Size([1, 16782]) Final batch size: 1, sequence length: 10469 Attention mask shape: torch.Size([1, 1, 10469, 10469]) Position ids shape: torch.Size([1, 10469]) Input IDs shape: torch.Size([1, 10469]) Labels shape: torch.Size([1, 10469]) Final batch size: 1, sequence length: 10390 Attention mask shape: torch.Size([1, 1, 10390, 10390]) Position ids shape: torch.Size([1, 10390]) Input IDs shape: torch.Size([1, 10390]) Labels shape: torch.Size([1, 10390]) Final batch size: 1, sequence length: 17417 Attention mask shape: torch.Size([1, 1, 17417, 17417]) Position ids shape: torch.Size([1, 17417]) Input IDs shape: torch.Size([1, 17417]) Labels shape: torch.Size([1, 17417]) Final batch size: 1, sequence length: 19671 Attention mask shape: torch.Size([1, 1, 19671, 19671]) Position ids shape: torch.Size([1, 19671]) Input IDs shape: torch.Size([1, 19671]) Labels shape: torch.Size([1, 19671]) Final batch size: 1, sequence length: 15185 Attention mask shape: torch.Size([1, 1, 15185, 15185]) Position ids shape: torch.Size([1, 15185]) Input IDs shape: torch.Size([1, 15185]) Labels shape: torch.Size([1, 15185]) Final batch size: 1, sequence length: 17980 Attention mask shape: torch.Size([1, 1, 17980, 17980]) Position ids shape: torch.Size([1, 17980]) Input IDs shape: torch.Size([1, 17980]) Labels shape: torch.Size([1, 17980]) Final batch size: 1, sequence length: 20243 Attention mask shape: torch.Size([1, 1, 20243, 20243]) Position ids shape: torch.Size([1, 20243]) Input IDs shape: torch.Size([1, 20243]) Labels shape: torch.Size([1, 20243]) Final batch size: 1, sequence length: 19187 Attention mask shape: torch.Size([1, 1, 19187, 19187]) Position ids shape: torch.Size([1, 19187]) Input IDs shape: torch.Size([1, 19187]) Labels shape: torch.Size([1, 19187]) Final batch size: 1, sequence length: 10277 Attention mask shape: torch.Size([1, 1, 10277, 10277]) Position ids shape: torch.Size([1, 10277]) Input IDs shape: torch.Size([1, 10277]) Labels shape: torch.Size([1, 10277]) Final batch size: 1, sequence length: 15263 Attention mask shape: torch.Size([1, 1, 15263, 15263]) Position ids shape: torch.Size([1, 15263]) Input IDs shape: torch.Size([1, 15263]) Labels shape: torch.Size([1, 15263]) Final batch size: 1, sequence length: 18469 Attention mask shape: torch.Size([1, 1, 18469, 18469]) Position ids shape: torch.Size([1, 18469]) Input IDs shape: torch.Size([1, 18469]) Labels shape: torch.Size([1, 18469]) Final batch size: 1, sequence length: 18454 Attention mask shape: torch.Size([1, 1, 18454, 18454]) Position ids shape: torch.Size([1, 18454]) Input IDs shape: torch.Size([1, 18454]) Labels shape: torch.Size([1, 18454]) Final batch size: 1, sequence length: 19683 Attention mask shape: torch.Size([1, 1, 19683, 19683]) Position ids shape: torch.Size([1, 19683]) Input IDs shape: torch.Size([1, 19683]) Labels shape: torch.Size([1, 19683]) Final batch size: 1, sequence length: 20191 Attention mask shape: torch.Size([1, 1, 20191, 20191]) Position ids shape: torch.Size([1, 20191]) Input IDs shape: torch.Size([1, 20191]) Labels shape: torch.Size([1, 20191]) Final batch size: 1, sequence length: 21051 Attention mask shape: torch.Size([1, 1, 21051, 21051]) Position ids shape: torch.Size([1, 21051]) Input IDs shape: torch.Size([1, 21051]) Labels shape: torch.Size([1, 21051]) Final batch size: 1, sequence length: 14892 Attention mask shape: torch.Size([1, 1, 14892, 14892]) Position ids shape: torch.Size([1, 14892]) Input IDs shape: torch.Size([1, 14892]) Labels shape: torch.Size([1, 14892]) Final batch size: 1, sequence length: 20432 Attention mask shape: torch.Size([1, 1, 20432, 20432]) Position ids shape: torch.Size([1, 20432]) Input IDs shape: torch.Size([1, 20432]) Labels shape: torch.Size([1, 20432]) Final batch size: 1, sequence length: 15339 Attention mask shape: torch.Size([1, 1, 15339, 15339]) Position ids shape: torch.Size([1, 15339]) Input IDs shape: torch.Size([1, 15339]) Labels shape: torch.Size([1, 15339]) Final batch size: 1, sequence length: 16585 Attention mask shape: torch.Size([1, 1, 16585, 16585]) Position ids shape: torch.Size([1, 16585]) Input IDs shape: torch.Size([1, 16585]) Labels shape: torch.Size([1, 16585]) Final batch size: 1, sequence length: 22133 Attention mask shape: torch.Size([1, 1, 22133, 22133]) Position ids shape: torch.Size([1, 22133]) Input IDs shape: torch.Size([1, 22133]) Labels shape: torch.Size([1, 22133]) Final batch size: 1, sequence length: 22857 Attention mask shape: torch.Size([1, 1, 22857, 22857]) Position ids shape: torch.Size([1, 22857]) Input IDs shape: torch.Size([1, 22857]) Labels shape: torch.Size([1, 22857]) Final batch size: 1, sequence length: 23238 Attention mask shape: torch.Size([1, 1, 23238, 23238]) Position ids shape: torch.Size([1, 23238]) Input IDs shape: torch.Size([1, 23238]) Labels shape: torch.Size([1, 23238]) Final batch size: 1, sequence length: 24428 Attention mask shape: torch.Size([1, 1, 24428, 24428]) Position ids shape: torch.Size([1, 24428]) Input IDs shape: torch.Size([1, 24428]) Labels shape: torch.Size([1, 24428]) Final batch size: 1, sequence length: 20562 Attention mask shape: torch.Size([1, 1, 20562, 20562]) Position ids shape: torch.Size([1, 20562]) Input IDs shape: torch.Size([1, 20562]) Labels shape: torch.Size([1, 20562]) Final batch size: 1, sequence length: 10182 Attention mask shape: torch.Size([1, 1, 10182, 10182]) Position ids shape: torch.Size([1, 10182]) Input IDs shape: torch.Size([1, 10182]) Labels shape: torch.Size([1, 10182]) Final batch size: 1, sequence length: 24002 Attention mask shape: torch.Size([1, 1, 24002, 24002]) Position ids shape: torch.Size([1, 24002]) Input IDs shape: torch.Size([1, 24002]) Labels shape: torch.Size([1, 24002]) Final batch size: 1, sequence length: 12012 Attention mask shape: torch.Size([1, 1, 12012, 12012]) Position ids shape: torch.Size([1, 12012]) Input IDs shape: torch.Size([1, 12012]) Labels shape: torch.Size([1, 12012]) Final batch size: 1, sequence length: 14556 Attention mask shape: torch.Size([1, 1, 14556, 14556]) Position ids shape: torch.Size([1, 14556]) Input IDs shape: torch.Size([1, 14556]) Labels shape: torch.Size([1, 14556]) Final batch size: 1, sequence length: 24255 Attention mask shape: torch.Size([1, 1, 24255, 24255]) Position ids shape: torch.Size([1, 24255]) Input IDs shape: torch.Size([1, 24255]) Labels shape: torch.Size([1, 24255]) Final batch size: 1, sequence length: 26033 Attention mask shape: torch.Size([1, 1, 26033, 26033]) Position ids shape: torch.Size([1, 26033]) Input IDs shape: torch.Size([1, 26033]) Labels shape: torch.Size([1, 26033]) Final batch size: 1, sequence length: 24769 Attention mask shape: torch.Size([1, 1, 24769, 24769]) Position ids shape: torch.Size([1, 24769]) Input IDs shape: torch.Size([1, 24769]) Labels shape: torch.Size([1, 24769]) Final batch size: 1, sequence length: 26312 Attention mask shape: torch.Size([1, 1, 26312, 26312]) Position ids shape: torch.Size([1, 26312]) Input IDs shape: torch.Size([1, 26312]) Labels shape: torch.Size([1, 26312]) Final batch size: 1, sequence length: 16541 Attention mask shape: torch.Size([1, 1, 16541, 16541]) Position ids shape: torch.Size([1, 16541]) Input IDs shape: torch.Size([1, 16541]) Labels shape: torch.Size([1, 16541]) Final batch size: 1, sequence length: 25707 Attention mask shape: torch.Size([1, 1, 25707, 25707]) Position ids shape: torch.Size([1, 25707]) Input IDs shape: torch.Size([1, 25707]) Labels shape: torch.Size([1, 25707]) Final batch size: 1, sequence length: 24858 Attention mask shape: torch.Size([1, 1, 24858, 24858]) Position ids shape: torch.Size([1, 24858]) Input IDs shape: torch.Size([1, 24858]) Labels shape: torch.Size([1, 24858]) Final batch size: 1, sequence length: 21664 Attention mask shape: torch.Size([1, 1, 21664, 21664]) Position ids shape: torch.Size([1, 21664]) Input IDs shape: torch.Size([1, 21664]) Labels shape: torch.Size([1, 21664]) Final batch size: 1, sequence length: 26144 Attention mask shape: torch.Size([1, 1, 26144, 26144]) Position ids shape: torch.Size([1, 26144]) Input IDs shape: torch.Size([1, 26144]) Labels shape: torch.Size([1, 26144]) Final batch size: 1, sequence length: 19238 Attention mask shape: torch.Size([1, 1, 19238, 19238]) Position ids shape: torch.Size([1, 19238]) Input IDs shape: torch.Size([1, 19238]) Labels shape: torch.Size([1, 19238]) Final batch size: 1, sequence length: 23602 Attention mask shape: torch.Size([1, 1, 23602, 23602]) Position ids shape: torch.Size([1, 23602]) Input IDs shape: torch.Size([1, 23602]) Labels shape: torch.Size([1, 23602]) Final batch size: 1, sequence length: 25886 Attention mask shape: torch.Size([1, 1, 25886, 25886]) Position ids shape: torch.Size([1, 25886]) Input IDs shape: torch.Size([1, 25886]) Labels shape: torch.Size([1, 25886]) Final batch size: 1, sequence length: 30079 Attention mask shape: torch.Size([1, 1, 30079, 30079]) Position ids shape: torch.Size([1, 30079]) Input IDs shape: torch.Size([1, 30079]) Labels shape: torch.Size([1, 30079]) Final batch size: 1, sequence length: 25758 Attention mask shape: torch.Size([1, 1, 25758, 25758]) Position ids shape: torch.Size([1, 25758]) Input IDs shape: torch.Size([1, 25758]) Labels shape: torch.Size([1, 25758]) Final batch size: 1, sequence length: 10318 Attention mask shape: torch.Size([1, 1, 10318, 10318]) Position ids shape: torch.Size([1, 10318]) Input IDs shape: torch.Size([1, 10318]) Labels shape: torch.Size([1, 10318]) Final batch size: 1, sequence length: 29749 Attention mask shape: torch.Size([1, 1, 29749, 29749]) Position ids shape: torch.Size([1, 29749]) Input IDs shape: torch.Size([1, 29749]) Labels shape: torch.Size([1, 29749]) Final batch size: 1, sequence length: 23698 Attention mask shape: torch.Size([1, 1, 23698, 23698]) Position ids shape: torch.Size([1, 23698]) Input IDs shape: torch.Size([1, 23698]) Labels shape: torch.Size([1, 23698]) Final batch size: 1, sequence length: 29948 Attention mask shape: torch.Size([1, 1, 29948, 29948]) Position ids shape: torch.Size([1, 29948]) Input IDs shape: torch.Size([1, 29948]) Labels shape: torch.Size([1, 29948]) Final batch size: 1, sequence length: 32541 Attention mask shape: torch.Size([1, 1, 32541, 32541]) Position ids shape: torch.Size([1, 32541]) Input IDs shape: torch.Size([1, 32541]) Labels shape: torch.Size([1, 32541]) Final batch size: 1, sequence length: 25719 Attention mask shape: torch.Size([1, 1, 25719, 25719]) Position ids shape: torch.Size([1, 25719]) Input IDs shape: torch.Size([1, 25719]) Labels shape: torch.Size([1, 25719]) Final batch size: 1, sequence length: 29825 Attention mask shape: torch.Size([1, 1, 29825, 29825]) Position ids shape: torch.Size([1, 29825]) Input IDs shape: torch.Size([1, 29825]) Labels shape: torch.Size([1, 29825]) Final batch size: 1, sequence length: 21374 Attention mask shape: torch.Size([1, 1, 21374, 21374]) Position ids shape: torch.Size([1, 21374]) Input IDs shape: torch.Size([1, 21374]) Labels shape: torch.Size([1, 21374]) Final batch size: 1, sequence length: 32071 Attention mask shape: torch.Size([1, 1, 32071, 32071]) Position ids shape: torch.Size([1, 32071]) Input IDs shape: torch.Size([1, 32071]) Labels shape: torch.Size([1, 32071]) Final batch size: 1, sequence length: 25656 Attention mask shape: torch.Size([1, 1, 25656, 25656]) Position ids shape: torch.Size([1, 25656]) Input IDs shape: torch.Size([1, 25656]) Labels shape: torch.Size([1, 25656]) Final batch size: 1, sequence length: 15604 Attention mask shape: torch.Size([1, 1, 15604, 15604]) Position ids shape: torch.Size([1, 15604]) Input IDs shape: torch.Size([1, 15604]) Labels shape: torch.Size([1, 15604]) Final batch size: 1, sequence length: 33611 Attention mask shape: torch.Size([1, 1, 33611, 33611]) Position ids shape: torch.Size([1, 33611]) Input IDs shape: torch.Size([1, 33611]) Labels shape: torch.Size([1, 33611]) Final batch size: 1, sequence length: 25492 Attention mask shape: torch.Size([1, 1, 25492, 25492]) Position ids shape: torch.Size([1, 25492]) Input IDs shape: torch.Size([1, 25492]) Labels shape: torch.Size([1, 25492]) Final batch size: 1, sequence length: 22139 Attention mask shape: torch.Size([1, 1, 22139, 22139]) Position ids shape: torch.Size([1, 22139]) Input IDs shape: torch.Size([1, 22139]) Labels shape: torch.Size([1, 22139]) Final batch size: 1, sequence length: 36778 Attention mask shape: torch.Size([1, 1, 36778, 36778]) Position ids shape: torch.Size([1, 36778]) Input IDs shape: torch.Size([1, 36778]) Labels shape: torch.Size([1, 36778]) Final batch size: 1, sequence length: 34701 Attention mask shape: torch.Size([1, 1, 34701, 34701]) Position ids shape: torch.Size([1, 34701]) Input IDs shape: torch.Size([1, 34701]) Labels shape: torch.Size([1, 34701]) Final batch size: 1, sequence length: 9897 Attention mask shape: torch.Size([1, 1, 9897, 9897]) Position ids shape: torch.Size([1, 9897]) Input IDs shape: torch.Size([1, 9897]) Labels shape: torch.Size([1, 9897]) Final batch size: 1, sequence length: 34186 Attention mask shape: torch.Size([1, 1, 34186, 34186]) Position ids shape: torch.Size([1, 34186]) Input IDs shape: torch.Size([1, 34186]) Labels shape: torch.Size([1, 34186]) Final batch size: 1, sequence length: 37241 Attention mask shape: torch.Size([1, 1, 37241, 37241]) Position ids shape: torch.Size([1, 37241]) Input IDs shape: torch.Size([1, 37241]) Labels shape: torch.Size([1, 37241]) Final batch size: 1, sequence length: 35760 Attention mask shape: torch.Size([1, 1, 35760, 35760]) Position ids shape: torch.Size([1, 35760]) Input IDs shape: torch.Size([1, 35760]) Labels shape: torch.Size([1, 35760]) Final batch size: 1, sequence length: 23399 Attention mask shape: torch.Size([1, 1, 23399, 23399]) Position ids shape: torch.Size([1, 23399]) Input IDs shape: torch.Size([1, 23399]) Labels shape: torch.Size([1, 23399]) Final batch size: 1, sequence length: 23343 Attention mask shape: torch.Size([1, 1, 23343, 23343]) Position ids shape: torch.Size([1, 23343]) Input IDs shape: torch.Size([1, 23343]) Labels shape: torch.Size([1, 23343]) Final batch size: 1, sequence length: 37728 Attention mask shape: torch.Size([1, 1, 37728, 37728]) Position ids shape: torch.Size([1, 37728]) Input IDs shape: torch.Size([1, 37728]) Labels shape: torch.Size([1, 37728]) Final batch size: 1, sequence length: 21557 Attention mask shape: torch.Size([1, 1, 21557, 21557]) Position ids shape: torch.Size([1, 21557]) Input IDs shape: torch.Size([1, 21557]) Labels shape: torch.Size([1, 21557]) Final batch size: 1, sequence length: 33459 Attention mask shape: torch.Size([1, 1, 33459, 33459]) Position ids shape: torch.Size([1, 33459]) Input IDs shape: torch.Size([1, 33459]) Labels shape: torch.Size([1, 33459]) Final batch size: 1, sequence length: 40593 Attention mask shape: torch.Size([1, 1, 40593, 40593]) Position ids shape: torch.Size([1, 40593]) Input IDs shape: torch.Size([1, 40593]) Labels shape: torch.Size([1, 40593]) Final batch size: 1, sequence length: 36456 Attention mask shape: torch.Size([1, 1, 36456, 36456]) Position ids shape: torch.Size([1, 36456]) Input IDs shape: torch.Size([1, 36456]) Labels shape: torch.Size([1, 36456]) Final batch size: 1, sequence length: 39760 Attention mask shape: torch.Size([1, 1, 39760, 39760]) Position ids shape: torch.Size([1, 39760]) Input IDs shape: torch.Size([1, 39760]) Labels shape: torch.Size([1, 39760]) Final batch size: 1, sequence length: 7681 Attention mask shape: torch.Size([1, 1, 7681, 7681]) Position ids shape: torch.Size([1, 7681]) Input IDs shape: torch.Size([1, 7681]) Labels shape: torch.Size([1, 7681]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 21955 Attention mask shape: torch.Size([1, 1, 21955, 21955]) Position ids shape: torch.Size([1, 21955]) Input IDs shape: torch.Size([1, 21955]) Labels shape: torch.Size([1, 21955]) Final batch size: 1, sequence length: 15294 Attention mask shape: torch.Size([1, 1, 15294, 15294]) Position ids shape: torch.Size([1, 15294]) Input IDs shape: torch.Size([1, 15294]) Labels shape: torch.Size([1, 15294]) Final batch size: 1, sequence length: 31650 Attention mask shape: torch.Size([1, 1, 31650, 31650]) Position ids shape: torch.Size([1, 31650]) Input IDs shape: torch.Size([1, 31650]) Labels shape: torch.Size([1, 31650]) Final batch size: 1, sequence length: 40507 Attention mask shape: torch.Size([1, 1, 40507, 40507]) Position ids shape: torch.Size([1, 40507]) Input IDs shape: torch.Size([1, 40507]) Labels shape: torch.Size([1, 40507]) Final batch size: 1, sequence length: 31323 Attention mask shape: torch.Size([1, 1, 31323, 31323]) Position ids shape: torch.Size([1, 31323]) Input IDs shape: torch.Size([1, 31323]) Labels shape: torch.Size([1, 31323]) Final batch size: 1, sequence length: 40147 Attention mask shape: torch.Size([1, 1, 40147, 40147]) Position ids shape: torch.Size([1, 40147]) Input IDs shape: torch.Size([1, 40147]) Labels shape: torch.Size([1, 40147]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 27283 Attention mask shape: torch.Size([1, 1, 27283, 27283]) Position ids shape: torch.Size([1, 27283]) Input IDs shape: torch.Size([1, 27283]) Labels shape: torch.Size([1, 27283]) Final batch size: 1, sequence length: 9445 Attention mask shape: torch.Size([1, 1, 9445, 9445]) Position ids shape: torch.Size([1, 9445]) Input IDs shape: torch.Size([1, 9445]) Labels shape: torch.Size([1, 9445]) Final batch size: 1, sequence length: 21489 Attention mask shape: torch.Size([1, 1, 21489, 21489]) Position ids shape: torch.Size([1, 21489]) Input IDs shape: torch.Size([1, 21489]) Labels shape: torch.Size([1, 21489]) Final batch size: 1, sequence length: 29082 Attention mask shape: torch.Size([1, 1, 29082, 29082]) Position ids shape: torch.Size([1, 29082]) Input IDs shape: torch.Size([1, 29082]) Labels shape: torch.Size([1, 29082]) Final batch size: 1, sequence length: 40960 Final batch size: 1, sequence length: 22434 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Attention mask shape: torch.Size([1, 1, 22434, 22434]) Position ids shape: torch.Size([1, 22434]) Input IDs shape: torch.Size([1, 22434]) Labels shape: torch.Size([1, 22434]) Final batch size: 1, sequence length: 22491 Attention mask shape: torch.Size([1, 1, 22491, 22491]) Position ids shape: torch.Size([1, 22491]) Input IDs shape: torch.Size([1, 22491]) Labels shape: torch.Size([1, 22491]) Final batch size: 1, sequence length: 37775 Attention mask shape: torch.Size([1, 1, 37775, 37775]) Position ids shape: torch.Size([1, 37775]) Input IDs shape: torch.Size([1, 37775]) Labels shape: torch.Size([1, 37775]) Final batch size: 1, sequence length: 26664 Attention mask shape: torch.Size([1, 1, 26664, 26664]) Position ids shape: torch.Size([1, 26664]) Input IDs shape: torch.Size([1, 26664]) Labels shape: torch.Size([1, 26664]) Final batch size: 1, sequence length: 17646 Attention mask shape: torch.Size([1, 1, 17646, 17646]) Position ids shape: torch.Size([1, 17646]) Input IDs shape: torch.Size([1, 17646]) Labels shape: torch.Size([1, 17646]) Final batch size: 1, sequence length: 19867 Attention mask shape: torch.Size([1, 1, 19867, 19867]) Position ids shape: torch.Size([1, 19867]) Input IDs shape: torch.Size([1, 19867]) Labels shape: torch.Size([1, 19867]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 37183 Attention mask shape: torch.Size([1, 1, 37183, 37183]) Position ids shape: torch.Size([1, 37183]) Input IDs shape: torch.Size([1, 37183]) Labels shape: torch.Size([1, 37183]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 23014 Attention mask shape: torch.Size([1, 1, 23014, 23014]) Position ids shape: torch.Size([1, 23014]) Input IDs shape: torch.Size([1, 23014]) Labels shape: torch.Size([1, 23014]) Final batch size: 1, sequence length: 16273 Attention mask shape: torch.Size([1, 1, 16273, 16273]) Position ids shape: torch.Size([1, 16273]) Input IDs shape: torch.Size([1, 16273]) Labels shape: torch.Size([1, 16273]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 36579 Attention mask shape: torch.Size([1, 1, 36579, 36579]) Position ids shape: torch.Size([1, 36579]) Input IDs shape: torch.Size([1, 36579]) Labels shape: torch.Size([1, 36579]) Final batch size: 1, sequence length: 28661 Attention mask shape: torch.Size([1, 1, 28661, 28661]) Position ids shape: torch.Size([1, 28661]) Input IDs shape: torch.Size([1, 28661]) Labels shape: torch.Size([1, 28661]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 33263 Attention mask shape: torch.Size([1, 1, 33263, 33263]) Position ids shape: torch.Size([1, 33263]) Input IDs shape: torch.Size([1, 33263]) Labels shape: torch.Size([1, 33263]) Final batch size: 1, sequence length: 38686 Attention mask shape: torch.Size([1, 1, 38686, 38686]) Position ids shape: torch.Size([1, 38686]) Input IDs shape: torch.Size([1, 38686]) Labels shape: torch.Size([1, 38686]) Final batch size: 1, sequence length: 37091 Attention mask shape: torch.Size([1, 1, 37091, 37091]) Position ids shape: torch.Size([1, 37091]) Input IDs shape: torch.Size([1, 37091]) Labels shape: torch.Size([1, 37091]) Final batch size: 1, sequence length: 25914 Attention mask shape: torch.Size([1, 1, 25914, 25914]) Position ids shape: torch.Size([1, 25914]) Input IDs shape: torch.Size([1, 25914]) Labels shape: torch.Size([1, 25914]) Final batch size: 1, sequence length: 30364 Attention mask shape: torch.Size([1, 1, 30364, 30364]) Position ids shape: torch.Size([1, 30364]) Input IDs shape: torch.Size([1, 30364]) Labels shape: torch.Size([1, 30364]) Final batch size: 1, sequence length: 36297 Attention mask shape: torch.Size([1, 1, 36297, 36297]) Position ids shape: torch.Size([1, 36297]) Input IDs shape: torch.Size([1, 36297]) Labels shape: torch.Size([1, 36297]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) {'loss': 0.3397, 'grad_norm': 0.6634835667448887, 'learning_rate': 9.667902132486009e-06, 'num_tokens': -inf, 'epoch': 1.5} Final batch size: 1, sequence length: 6519 Attention mask shape: torch.Size([1, 1, 6519, 6519]) Position ids shape: torch.Size([1, 6519]) Input IDs shape: torch.Size([1, 6519]) Labels shape: torch.Size([1, 6519]) Final batch size: 1, sequence length: 5525 Attention mask shape: torch.Size([1, 1, 5525, 5525]) Position ids shape: torch.Size([1, 5525]) Input IDs shape: torch.Size([1, 5525]) Labels shape: torch.Size([1, 5525]) Final batch size: 1, sequence length: 10273 Attention mask shape: torch.Size([1, 1, 10273, 10273]) Position ids shape: torch.Size([1, 10273]) Input IDs shape: torch.Size([1, 10273]) Labels shape: torch.Size([1, 10273]) Final batch size: 1, sequence length: 9181 Attention mask shape: torch.Size([1, 1, 9181, 9181]) Position ids shape: torch.Size([1, 9181]) Input IDs shape: torch.Size([1, 9181]) Labels shape: torch.Size([1, 9181]) Final batch size: 1, sequence length: 10408 Attention mask shape: torch.Size([1, 1, 10408, 10408]) Position ids shape: torch.Size([1, 10408]) Input IDs shape: torch.Size([1, 10408]) Labels shape: torch.Size([1, 10408]) Final batch size: 1, sequence length: 12281 Attention mask shape: torch.Size([1, 1, 12281, 12281]) Position ids shape: torch.Size([1, 12281]) Input IDs shape: torch.Size([1, 12281]) Labels shape: torch.Size([1, 12281]) Final batch size: 1, sequence length: 12927 Attention mask shape: torch.Size([1, 1, 12927, 12927]) Position ids shape: torch.Size([1, 12927]) Input IDs shape: torch.Size([1, 12927]) Labels shape: torch.Size([1, 12927]) Final batch size: 1, sequence length: 13363 Attention mask shape: torch.Size([1, 1, 13363, 13363]) Position ids shape: torch.Size([1, 13363]) Input IDs shape: torch.Size([1, 13363]) Labels shape: torch.Size([1, 13363]) Final batch size: 1, sequence length: 13385 Attention mask shape: torch.Size([1, 1, 13385, 13385]) Position ids shape: torch.Size([1, 13385]) Input IDs shape: torch.Size([1, 13385]) Labels shape: torch.Size([1, 13385]) Final batch size: 1, sequence length: 13804 Attention mask shape: torch.Size([1, 1, 13804, 13804]) Position ids shape: torch.Size([1, 13804]) Input IDs shape: torch.Size([1, 13804]) Labels shape: torch.Size([1, 13804]) Final batch size: 1, sequence length: 15257 Attention mask shape: torch.Size([1, 1, 15257, 15257]) Position ids shape: torch.Size([1, 15257]) Input IDs shape: torch.Size([1, 15257]) Labels shape: torch.Size([1, 15257]) Final batch size: 1, sequence length: 10905 Attention mask shape: torch.Size([1, 1, 10905, 10905]) Position ids shape: torch.Size([1, 10905]) Input IDs shape: torch.Size([1, 10905]) Labels shape: torch.Size([1, 10905]) Final batch size: 1, sequence length: 17003 Attention mask shape: torch.Size([1, 1, 17003, 17003]) Position ids shape: torch.Size([1, 17003]) Input IDs shape: torch.Size([1, 17003]) Labels shape: torch.Size([1, 17003]) Final batch size: 1, sequence length: 15518 Attention mask shape: torch.Size([1, 1, 15518, 15518]) Position ids shape: torch.Size([1, 15518]) Input IDs shape: torch.Size([1, 15518]) Labels shape: torch.Size([1, 15518]) Final batch size: 1, sequence length: 16520 Attention mask shape: torch.Size([1, 1, 16520, 16520]) Position ids shape: torch.Size([1, 16520]) Input IDs shape: torch.Size([1, 16520]) Labels shape: torch.Size([1, 16520]) Final batch size: 1, sequence length: 19768 Attention mask shape: torch.Size([1, 1, 19768, 19768]) Position ids shape: torch.Size([1, 19768]) Input IDs shape: torch.Size([1, 19768]) Labels shape: torch.Size([1, 19768]) Final batch size: 1, sequence length: 19286 Attention mask shape: torch.Size([1, 1, 19286, 19286]) Position ids shape: torch.Size([1, 19286]) Input IDs shape: torch.Size([1, 19286]) Labels shape: torch.Size([1, 19286]) Final batch size: 1, sequence length: 15244 Attention mask shape: torch.Size([1, 1, 15244, 15244]) Position ids shape: torch.Size([1, 15244]) Input IDs shape: torch.Size([1, 15244]) Labels shape: torch.Size([1, 15244]) Final batch size: 1, sequence length: 8646 Attention mask shape: torch.Size([1, 1, 8646, 8646]) Position ids shape: torch.Size([1, 8646]) Input IDs shape: torch.Size([1, 8646]) Labels shape: torch.Size([1, 8646]) Final batch size: 1, sequence length: 20089 Attention mask shape: torch.Size([1, 1, 20089, 20089]) Position ids shape: torch.Size([1, 20089]) Input IDs shape: torch.Size([1, 20089]) Labels shape: torch.Size([1, 20089]) Final batch size: 1, sequence length: 5405 Attention mask shape: torch.Size([1, 1, 5405, 5405]) Position ids shape: torch.Size([1, 5405]) Input IDs shape: torch.Size([1, 5405]) Labels shape: torch.Size([1, 5405]) Final batch size: 1, sequence length: 18819 Attention mask shape: torch.Size([1, 1, 18819, 18819]) Position ids shape: torch.Size([1, 18819]) Input IDs shape: torch.Size([1, 18819]) Labels shape: torch.Size([1, 18819]) Final batch size: 1, sequence length: 18645 Attention mask shape: torch.Size([1, 1, 18645, 18645]) Position ids shape: torch.Size([1, 18645]) Input IDs shape: torch.Size([1, 18645]) Labels shape: torch.Size([1, 18645]) Final batch size: 1, sequence length: 19513 Attention mask shape: torch.Size([1, 1, 19513, 19513]) Position ids shape: torch.Size([1, 19513]) Input IDs shape: torch.Size([1, 19513]) Labels shape: torch.Size([1, 19513]) Final batch size: 1, sequence length: 23886 Attention mask shape: torch.Size([1, 1, 23886, 23886]) Position ids shape: torch.Size([1, 23886]) Input IDs shape: torch.Size([1, 23886]) Labels shape: torch.Size([1, 23886]) Final batch size: 1, sequence length: 14429 Attention mask shape: torch.Size([1, 1, 14429, 14429]) Position ids shape: torch.Size([1, 14429]) Input IDs shape: torch.Size([1, 14429]) Labels shape: torch.Size([1, 14429]) Final batch size: 1, sequence length: 23942 Attention mask shape: torch.Size([1, 1, 23942, 23942]) Position ids shape: torch.Size([1, 23942]) Input IDs shape: torch.Size([1, 23942]) Labels shape: torch.Size([1, 23942]) Final batch size: 1, sequence length: 20979 Attention mask shape: torch.Size([1, 1, 20979, 20979]) Position ids shape: torch.Size([1, 20979]) Input IDs shape: torch.Size([1, 20979]) Labels shape: torch.Size([1, 20979]) Final batch size: 1, sequence length: 22311 Attention mask shape: torch.Size([1, 1, 22311, 22311]) Position ids shape: torch.Size([1, 22311]) Input IDs shape: torch.Size([1, 22311]) Labels shape: torch.Size([1, 22311]) Final batch size: 1, sequence length: 20106 Attention mask shape: torch.Size([1, 1, 20106, 20106]) Position ids shape: torch.Size([1, 20106]) Input IDs shape: torch.Size([1, 20106]) Labels shape: torch.Size([1, 20106]) Final batch size: 1, sequence length: 23995 Attention mask shape: torch.Size([1, 1, 23995, 23995]) Position ids shape: torch.Size([1, 23995]) Input IDs shape: torch.Size([1, 23995]) Labels shape: torch.Size([1, 23995]) Final batch size: 1, sequence length: 25001 Attention mask shape: torch.Size([1, 1, 25001, 25001]) Position ids shape: torch.Size([1, 25001]) Input IDs shape: torch.Size([1, 25001]) Labels shape: torch.Size([1, 25001]) Final batch size: 1, sequence length: 18988 Attention mask shape: torch.Size([1, 1, 18988, 18988]) Position ids shape: torch.Size([1, 18988]) Input IDs shape: torch.Size([1, 18988]) Labels shape: torch.Size([1, 18988]) Final batch size: 1, sequence length: 22777 Attention mask shape: torch.Size([1, 1, 22777, 22777]) Position ids shape: torch.Size([1, 22777]) Input IDs shape: torch.Size([1, 22777]) Labels shape: torch.Size([1, 22777]) Final batch size: 1, sequence length: 22160 Attention mask shape: torch.Size([1, 1, 22160, 22160]) Position ids shape: torch.Size([1, 22160]) Input IDs shape: torch.Size([1, 22160]) Labels shape: torch.Size([1, 22160]) Final batch size: 1, sequence length: 25021 Attention mask shape: torch.Size([1, 1, 25021, 25021]) Position ids shape: torch.Size([1, 25021]) Input IDs shape: torch.Size([1, 25021]) Labels shape: torch.Size([1, 25021]) Final batch size: 1, sequence length: 23341 Attention mask shape: torch.Size([1, 1, 23341, 23341]) Position ids shape: torch.Size([1, 23341]) Input IDs shape: torch.Size([1, 23341]) Labels shape: torch.Size([1, 23341]) Final batch size: 1, sequence length: 24293 Attention mask shape: torch.Size([1, 1, 24293, 24293]) Position ids shape: torch.Size([1, 24293]) Input IDs shape: torch.Size([1, 24293]) Labels shape: torch.Size([1, 24293]) Final batch size: 1, sequence length: 22915 Attention mask shape: torch.Size([1, 1, 22915, 22915]) Position ids shape: torch.Size([1, 22915]) Input IDs shape: torch.Size([1, 22915]) Labels shape: torch.Size([1, 22915]) Final batch size: 1, sequence length: 21660 Attention mask shape: torch.Size([1, 1, 21660, 21660]) Position ids shape: torch.Size([1, 21660]) Input IDs shape: torch.Size([1, 21660]) Labels shape: torch.Size([1, 21660]) Final batch size: 1, sequence length: 22979 Attention mask shape: torch.Size([1, 1, 22979, 22979]) Position ids shape: torch.Size([1, 22979]) Input IDs shape: torch.Size([1, 22979]) Labels shape: torch.Size([1, 22979]) Final batch size: 1, sequence length: 13600 Attention mask shape: torch.Size([1, 1, 13600, 13600]) Position ids shape: torch.Size([1, 13600]) Input IDs shape: torch.Size([1, 13600]) Labels shape: torch.Size([1, 13600]) Final batch size: 1, sequence length: 12483 Attention mask shape: torch.Size([1, 1, 12483, 12483]) Position ids shape: torch.Size([1, 12483]) Input IDs shape: torch.Size([1, 12483]) Labels shape: torch.Size([1, 12483]) Final batch size: 1, sequence length: 24287 Attention mask shape: torch.Size([1, 1, 24287, 24287]) Position ids shape: torch.Size([1, 24287]) Input IDs shape: torch.Size([1, 24287]) Labels shape: torch.Size([1, 24287]) Final batch size: 1, sequence length: 23724 Attention mask shape: torch.Size([1, 1, 23724, 23724]) Position ids shape: torch.Size([1, 23724]) Input IDs shape: torch.Size([1, 23724]) Labels shape: torch.Size([1, 23724]) Final batch size: 1, sequence length: 26179 Attention mask shape: torch.Size([1, 1, 26179, 26179]) Position ids shape: torch.Size([1, 26179]) Input IDs shape: torch.Size([1, 26179]) Labels shape: torch.Size([1, 26179]) Final batch size: 1, sequence length: 5288 Attention mask shape: torch.Size([1, 1, 5288, 5288]) Position ids shape: torch.Size([1, 5288]) Input IDs shape: torch.Size([1, 5288]) Labels shape: torch.Size([1, 5288]) Final batch size: 1, sequence length: 26054 Attention mask shape: torch.Size([1, 1, 26054, 26054]) Position ids shape: torch.Size([1, 26054]) Input IDs shape: torch.Size([1, 26054]) Labels shape: torch.Size([1, 26054]) Final batch size: 1, sequence length: 24407 Attention mask shape: torch.Size([1, 1, 24407, 24407]) Position ids shape: torch.Size([1, 24407]) Input IDs shape: torch.Size([1, 24407]) Labels shape: torch.Size([1, 24407]) Final batch size: 1, sequence length: 11266 Attention mask shape: torch.Size([1, 1, 11266, 11266]) Position ids shape: torch.Size([1, 11266]) Input IDs shape: torch.Size([1, 11266]) Labels shape: torch.Size([1, 11266]) Final batch size: 1, sequence length: 28440 Attention mask shape: torch.Size([1, 1, 28440, 28440]) Position ids shape: torch.Size([1, 28440]) Input IDs shape: torch.Size([1, 28440]) Labels shape: torch.Size([1, 28440]) Final batch size: 1, sequence length: 29009 Attention mask shape: torch.Size([1, 1, 29009, 29009]) Position ids shape: torch.Size([1, 29009]) Input IDs shape: torch.Size([1, 29009]) Labels shape: torch.Size([1, 29009]) Final batch size: 1, sequence length: 26461 Attention mask shape: torch.Size([1, 1, 26461, 26461]) Position ids shape: torch.Size([1, 26461]) Input IDs shape: torch.Size([1, 26461]) Labels shape: torch.Size([1, 26461]) Final batch size: 1, sequence length: 20941 Attention mask shape: torch.Size([1, 1, 20941, 20941]) Position ids shape: torch.Size([1, 20941]) Input IDs shape: torch.Size([1, 20941]) Labels shape: torch.Size([1, 20941]) Final batch size: 1, sequence length: 28263 Attention mask shape: torch.Size([1, 1, 28263, 28263]) Position ids shape: torch.Size([1, 28263]) Input IDs shape: torch.Size([1, 28263]) Labels shape: torch.Size([1, 28263]) Final batch size: 1, sequence length: 30723 Attention mask shape: torch.Size([1, 1, 30723, 30723]) Position ids shape: torch.Size([1, 30723]) Input IDs shape: torch.Size([1, 30723]) Labels shape: torch.Size([1, 30723]) Final batch size: 1, sequence length: 29237 Attention mask shape: torch.Size([1, 1, 29237, 29237]) Position ids shape: torch.Size([1, 29237]) Input IDs shape: torch.Size([1, 29237]) Labels shape: torch.Size([1, 29237]) Final batch size: 1, sequence length: 30236 Attention mask shape: torch.Size([1, 1, 30236, 30236]) Position ids shape: torch.Size([1, 30236]) Input IDs shape: torch.Size([1, 30236]) Labels shape: torch.Size([1, 30236]) Final batch size: 1, sequence length: 17951 Attention mask shape: torch.Size([1, 1, 17951, 17951]) Position ids shape: torch.Size([1, 17951]) Input IDs shape: torch.Size([1, 17951]) Labels shape: torch.Size([1, 17951]) Final batch size: 1, sequence length: 32799 Attention mask shape: torch.Size([1, 1, 32799, 32799]) Position ids shape: torch.Size([1, 32799]) Input IDs shape: torch.Size([1, 32799]) Labels shape: torch.Size([1, 32799]) Final batch size: 1, sequence length: 26886 Attention mask shape: torch.Size([1, 1, 26886, 26886]) Position ids shape: torch.Size([1, 26886]) Input IDs shape: torch.Size([1, 26886]) Labels shape: torch.Size([1, 26886]) Final batch size: 1, sequence length: 32083 Attention mask shape: torch.Size([1, 1, 32083, 32083]) Position ids shape: torch.Size([1, 32083]) Input IDs shape: torch.Size([1, 32083]) Labels shape: torch.Size([1, 32083]) Final batch size: 1, sequence length: 17539 Attention mask shape: torch.Size([1, 1, 17539, 17539]) Position ids shape: torch.Size([1, 17539]) Input IDs shape: torch.Size([1, 17539]) Labels shape: torch.Size([1, 17539]) Final batch size: 1, sequence length: 21472 Attention mask shape: torch.Size([1, 1, 21472, 21472]) Position ids shape: torch.Size([1, 21472]) Input IDs shape: torch.Size([1, 21472]) Labels shape: torch.Size([1, 21472]) Final batch size: 1, sequence length: 27405 Attention mask shape: torch.Size([1, 1, 27405, 27405]) Position ids shape: torch.Size([1, 27405]) Input IDs shape: torch.Size([1, 27405]) Labels shape: torch.Size([1, 27405]) Final batch size: 1, sequence length: 32660 Attention mask shape: torch.Size([1, 1, 32660, 32660]) Position ids shape: torch.Size([1, 32660]) Input IDs shape: torch.Size([1, 32660]) Labels shape: torch.Size([1, 32660]) Final batch size: 1, sequence length: 29109 Attention mask shape: torch.Size([1, 1, 29109, 29109]) Position ids shape: torch.Size([1, 29109]) Input IDs shape: torch.Size([1, 29109]) Labels shape: torch.Size([1, 29109]) Final batch size: 1, sequence length: 29880 Attention mask shape: torch.Size([1, 1, 29880, 29880]) Position ids shape: torch.Size([1, 29880]) Input IDs shape: torch.Size([1, 29880]) Labels shape: torch.Size([1, 29880]) Final batch size: 1, sequence length: 32287 Attention mask shape: torch.Size([1, 1, 32287, 32287]) Position ids shape: torch.Size([1, 32287]) Input IDs shape: torch.Size([1, 32287]) Labels shape: torch.Size([1, 32287]) Final batch size: 1, sequence length: 29152 Attention mask shape: torch.Size([1, 1, 29152, 29152]) Position ids shape: torch.Size([1, 29152]) Input IDs shape: torch.Size([1, 29152]) Labels shape: torch.Size([1, 29152]) Final batch size: 1, sequence length: 17649 Attention mask shape: torch.Size([1, 1, 17649, 17649]) Position ids shape: torch.Size([1, 17649]) Input IDs shape: torch.Size([1, 17649]) Labels shape: torch.Size([1, 17649]) Final batch size: 1, sequence length: 15830 Attention mask shape: torch.Size([1, 1, 15830, 15830]) Position ids shape: torch.Size([1, 15830]) Input IDs shape: torch.Size([1, 15830]) Labels shape: torch.Size([1, 15830]) Final batch size: 1, sequence length: 31512 Attention mask shape: torch.Size([1, 1, 31512, 31512]) Position ids shape: torch.Size([1, 31512]) Input IDs shape: torch.Size([1, 31512]) Labels shape: torch.Size([1, 31512]) Final batch size: 1, sequence length: 29561 Attention mask shape: torch.Size([1, 1, 29561, 29561]) Position ids shape: torch.Size([1, 29561]) Input IDs shape: torch.Size([1, 29561]) Labels shape: torch.Size([1, 29561]) Final batch size: 1, sequence length: 26452 Attention mask shape: torch.Size([1, 1, 26452, 26452]) Position ids shape: torch.Size([1, 26452]) Input IDs shape: torch.Size([1, 26452]) Labels shape: torch.Size([1, 26452]) Final batch size: 1, sequence length: 27987 Attention mask shape: torch.Size([1, 1, 27987, 27987]) Position ids shape: torch.Size([1, 27987]) Input IDs shape: torch.Size([1, 27987]) Labels shape: torch.Size([1, 27987]) Final batch size: 1, sequence length: 8008 Attention mask shape: torch.Size([1, 1, 8008, 8008]) Position ids shape: torch.Size([1, 8008]) Input IDs shape: torch.Size([1, 8008]) Labels shape: torch.Size([1, 8008]) Final batch size: 1, sequence length: 35696 Attention mask shape: torch.Size([1, 1, 35696, 35696]) Position ids shape: torch.Size([1, 35696]) Input IDs shape: torch.Size([1, 35696]) Labels shape: torch.Size([1, 35696]) Final batch size: 1, sequence length: 35478 Attention mask shape: torch.Size([1, 1, 35478, 35478]) Position ids shape: torch.Size([1, 35478]) Input IDs shape: torch.Size([1, 35478]) Labels shape: torch.Size([1, 35478]) Final batch size: 1, sequence length: 34694 Attention mask shape: torch.Size([1, 1, 34694, 34694]) Position ids shape: torch.Size([1, 34694]) Input IDs shape: torch.Size([1, 34694]) Labels shape: torch.Size([1, 34694]) Final batch size: 1, sequence length: 15906 Attention mask shape: torch.Size([1, 1, 15906, 15906]) Position ids shape: torch.Size([1, 15906]) Input IDs shape: torch.Size([1, 15906]) Labels shape: torch.Size([1, 15906]) Final batch size: 1, sequence length: 35999 Attention mask shape: torch.Size([1, 1, 35999, 35999]) Position ids shape: torch.Size([1, 35999]) Input IDs shape: torch.Size([1, 35999]) Labels shape: torch.Size([1, 35999]) Final batch size: 1, sequence length: 24056 Attention mask shape: torch.Size([1, 1, 24056, 24056]) Position ids shape: torch.Size([1, 24056]) Input IDs shape: torch.Size([1, 24056]) Labels shape: torch.Size([1, 24056]) Final batch size: 1, sequence length: 17775 Attention mask shape: torch.Size([1, 1, 17775, 17775]) Position ids shape: torch.Size([1, 17775]) Input IDs shape: torch.Size([1, 17775]) Labels shape: torch.Size([1, 17775]) Final batch size: 1, sequence length: 23945 Attention mask shape: torch.Size([1, 1, 23945, 23945]) Position ids shape: torch.Size([1, 23945]) Input IDs shape: torch.Size([1, 23945]) Labels shape: torch.Size([1, 23945]) Final batch size: 1, sequence length: 28879 Attention mask shape: torch.Size([1, 1, 28879, 28879]) Position ids shape: torch.Size([1, 28879]) Input IDs shape: torch.Size([1, 28879]) Labels shape: torch.Size([1, 28879]) Final batch size: 1, sequence length: 36314 Attention mask shape: torch.Size([1, 1, 36314, 36314]) Position ids shape: torch.Size([1, 36314]) Input IDs shape: torch.Size([1, 36314]) Labels shape: torch.Size([1, 36314]) Final batch size: 1, sequence length: 29262 Attention mask shape: torch.Size([1, 1, 29262, 29262]) Position ids shape: torch.Size([1, 29262]) Input IDs shape: torch.Size([1, 29262]) Labels shape: torch.Size([1, 29262]) Final batch size: 1, sequence length: 16852 Attention mask shape: torch.Size([1, 1, 16852, 16852]) Position ids shape: torch.Size([1, 16852]) Input IDs shape: torch.Size([1, 16852]) Labels shape: torch.Size([1, 16852]) Final batch size: 1, sequence length: 31786 Attention mask shape: torch.Size([1, 1, 31786, 31786]) Position ids shape: torch.Size([1, 31786]) Input IDs shape: torch.Size([1, 31786]) Labels shape: torch.Size([1, 31786]) Final batch size: 1, sequence length: 39476 Attention mask shape: torch.Size([1, 1, 39476, 39476]) Position ids shape: torch.Size([1, 39476]) Input IDs shape: torch.Size([1, 39476]) Labels shape: torch.Size([1, 39476]) Final batch size: 1, sequence length: 7448 Attention mask shape: torch.Size([1, 1, 7448, 7448]) Position ids shape: torch.Size([1, 7448]) Input IDs shape: torch.Size([1, 7448]) Labels shape: torch.Size([1, 7448]) Final batch size: 1, sequence length: 16631 Attention mask shape: torch.Size([1, 1, 16631, 16631]) Position ids shape: torch.Size([1, 16631]) Input IDs shape: torch.Size([1, 16631]) Labels shape: torch.Size([1, 16631]) Final batch size: 1, sequence length: 28691 Attention mask shape: torch.Size([1, 1, 28691, 28691]) Position ids shape: torch.Size([1, 28691]) Input IDs shape: torch.Size([1, 28691]) Labels shape: torch.Size([1, 28691]) Final batch size: 1, sequence length: 35526 Attention mask shape: torch.Size([1, 1, 35526, 35526]) Position ids shape: torch.Size([1, 35526]) Input IDs shape: torch.Size([1, 35526]) Labels shape: torch.Size([1, 35526]) Final batch size: 1, sequence length: 6378 Attention mask shape: torch.Size([1, 1, 6378, 6378]) Position ids shape: torch.Size([1, 6378]) Input IDs shape: torch.Size([1, 6378]) Labels shape: torch.Size([1, 6378]) Final batch size: 1, sequence length: 34777 Attention mask shape: torch.Size([1, 1, 34777, 34777]) Position ids shape: torch.Size([1, 34777]) Input IDs shape: torch.Size([1, 34777]) Labels shape: torch.Size([1, 34777]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 10480 Attention mask shape: torch.Size([1, 1, 10480, 10480]) Position ids shape: torch.Size([1, 10480]) Input IDs shape: torch.Size([1, 10480]) Labels shape: torch.Size([1, 10480]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 31132 Attention mask shape: torch.Size([1, 1, 31132, 31132]) Position ids shape: torch.Size([1, 31132]) Input IDs shape: torch.Size([1, 31132]) Labels shape: torch.Size([1, 31132]) Final batch size: 1, sequence length: 11395 Attention mask shape: torch.Size([1, 1, 11395, 11395]) Position ids shape: torch.Size([1, 11395]) Input IDs shape: torch.Size([1, 11395]) Labels shape: torch.Size([1, 11395]) Final batch size: 1, sequence length: 24824 Attention mask shape: torch.Size([1, 1, 24824, 24824]) Position ids shape: torch.Size([1, 24824]) Input IDs shape: torch.Size([1, 24824]) Labels shape: torch.Size([1, 24824]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 37933 Attention mask shape: torch.Size([1, 1, 37933, 37933]) Position ids shape: torch.Size([1, 37933]) Input IDs shape: torch.Size([1, 37933]) Labels shape: torch.Size([1, 37933]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 28495 Attention mask shape: torch.Size([1, 1, 28495, 28495]) Position ids shape: torch.Size([1, 28495]) Input IDs shape: torch.Size([1, 28495]) Labels shape: torch.Size([1, 28495]) Final batch size: 1, sequence length: 20176 Attention mask shape: torch.Size([1, 1, 20176, 20176]) Position ids shape: torch.Size([1, 20176]) Input IDs shape: torch.Size([1, 20176]) Labels shape: torch.Size([1, 20176]) Final batch size: 1, sequence length: 20694 Attention mask shape: torch.Size([1, 1, 20694, 20694]) Position ids shape: torch.Size([1, 20694]) Input IDs shape: torch.Size([1, 20694]) Labels shape: torch.Size([1, 20694]) Final batch size: 1, sequence length: 18127 Attention mask shape: torch.Size([1, 1, 18127, 18127]) Position ids shape: torch.Size([1, 18127]) Input IDs shape: torch.Size([1, 18127]) Labels shape: torch.Size([1, 18127]) Final batch size: 1, sequence length: 38935 Attention mask shape: torch.Size([1, 1, 38935, 38935]) Position ids shape: torch.Size([1, 38935]) Input IDs shape: torch.Size([1, 38935]) Labels shape: torch.Size([1, 38935]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 30024 Attention mask shape: torch.Size([1, 1, 30024, 30024]) Position ids shape: torch.Size([1, 30024]) Input IDs shape: torch.Size([1, 30024]) Labels shape: torch.Size([1, 30024]) Final batch size: 1, sequence length: 18911 Attention mask shape: torch.Size([1, 1, 18911, 18911]) Position ids shape: torch.Size([1, 18911]) Input IDs shape: torch.Size([1, 18911]) Labels shape: torch.Size([1, 18911]) Final batch size: 1, sequence length: 14869 Attention mask shape: torch.Size([1, 1, 14869, 14869]) Position ids shape: torch.Size([1, 14869]) Input IDs shape: torch.Size([1, 14869]) Labels shape: torch.Size([1, 14869]) Final batch size: 1, sequence length: 40605 Attention mask shape: torch.Size([1, 1, 40605, 40605]) Position ids shape: torch.Size([1, 40605]) Input IDs shape: torch.Size([1, 40605]) Labels shape: torch.Size([1, 40605]) Final batch size: 1, sequence length: 37159 Attention mask shape: torch.Size([1, 1, 37159, 37159]) Position ids shape: torch.Size([1, 37159]) Input IDs shape: torch.Size([1, 37159]) Labels shape: torch.Size([1, 37159]) Final batch size: 1, sequence length: 39324 Attention mask shape: torch.Size([1, 1, 39324, 39324]) Position ids shape: torch.Size([1, 39324]) Input IDs shape: torch.Size([1, 39324]) Labels shape: torch.Size([1, 39324]) Final batch size: 1, sequence length: 35116 Attention mask shape: torch.Size([1, 1, 35116, 35116]) Position ids shape: torch.Size([1, 35116]) Input IDs shape: torch.Size([1, 35116]) Labels shape: torch.Size([1, 35116]) Final batch size: 1, sequence length: 28313 Attention mask shape: torch.Size([1, 1, 28313, 28313]) Position ids shape: torch.Size([1, 28313]) Input IDs shape: torch.Size([1, 28313]) Labels shape: torch.Size([1, 28313]) Final batch size: 1, sequence length: 36469 Attention mask shape: torch.Size([1, 1, 36469, 36469]) Position ids shape: torch.Size([1, 36469]) Input IDs shape: torch.Size([1, 36469]) Labels shape: torch.Size([1, 36469]) Final batch size: 1, sequence length: 39687 Attention mask shape: torch.Size([1, 1, 39687, 39687]) Position ids shape: torch.Size([1, 39687]) Input IDs shape: torch.Size([1, 39687]) Labels shape: torch.Size([1, 39687]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 30245 Attention mask shape: torch.Size([1, 1, 30245, 30245]) Position ids shape: torch.Size([1, 30245]) Input IDs shape: torch.Size([1, 30245]) Labels shape: torch.Size([1, 30245]) Final batch size: 1, sequence length: 33070 Attention mask shape: torch.Size([1, 1, 33070, 33070]) Position ids shape: torch.Size([1, 33070]) Input IDs shape: torch.Size([1, 33070]) Labels shape: torch.Size([1, 33070]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 26402 Attention mask shape: torch.Size([1, 1, 26402, 26402]) Position ids shape: torch.Size([1, 26402]) Input IDs shape: torch.Size([1, 26402]) Labels shape: torch.Size([1, 26402]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) {'loss': 0.3133, 'grad_norm': 0.5262692418688812, 'learning_rate': 9.567727288213005e-06, 'num_tokens': -inf, 'epoch': 1.62} Final batch size: 1, sequence length: 3968 Attention mask shape: torch.Size([1, 1, 3968, 3968]) Position ids shape: torch.Size([1, 3968]) Input IDs shape: torch.Size([1, 3968]) Labels shape: torch.Size([1, 3968]) Final batch size: 1, sequence length: 6567 Attention mask shape: torch.Size([1, 1, 6567, 6567]) Position ids shape: torch.Size([1, 6567]) Input IDs shape: torch.Size([1, 6567]) Labels shape: torch.Size([1, 6567]) Final batch size: 1, sequence length: 8347 Attention mask shape: torch.Size([1, 1, 8347, 8347]) Position ids shape: torch.Size([1, 8347]) Input IDs shape: torch.Size([1, 8347]) Labels shape: torch.Size([1, 8347]) Final batch size: 1, sequence length: 5997 Attention mask shape: torch.Size([1, 1, 5997, 5997]) Position ids shape: torch.Size([1, 5997]) Input IDs shape: torch.Size([1, 5997]) Labels shape: torch.Size([1, 5997]) Final batch size: 1, sequence length: 9027 Attention mask shape: torch.Size([1, 1, 9027, 9027]) Position ids shape: torch.Size([1, 9027]) Input IDs shape: torch.Size([1, 9027]) Labels shape: torch.Size([1, 9027]) Final batch size: 1, sequence length: 11752 Attention mask shape: torch.Size([1, 1, 11752, 11752]) Position ids shape: torch.Size([1, 11752]) Input IDs shape: torch.Size([1, 11752]) Labels shape: torch.Size([1, 11752]) Final batch size: 1, sequence length: 11054 Attention mask shape: torch.Size([1, 1, 11054, 11054]) Position ids shape: torch.Size([1, 11054]) Input IDs shape: torch.Size([1, 11054]) Labels shape: torch.Size([1, 11054]) Final batch size: 1, sequence length: 12967 Attention mask shape: torch.Size([1, 1, 12967, 12967]) Position ids shape: torch.Size([1, 12967]) Input IDs shape: torch.Size([1, 12967]) Labels shape: torch.Size([1, 12967]) Final batch size: 1, sequence length: 11232 Attention mask shape: torch.Size([1, 1, 11232, 11232]) Position ids shape: torch.Size([1, 11232]) Input IDs shape: torch.Size([1, 11232]) Labels shape: torch.Size([1, 11232]) Final batch size: 1, sequence length: 14758 Attention mask shape: torch.Size([1, 1, 14758, 14758]) Position ids shape: torch.Size([1, 14758]) Input IDs shape: torch.Size([1, 14758]) Labels shape: torch.Size([1, 14758]) Final batch size: 1, sequence length: 12370 Attention mask shape: torch.Size([1, 1, 12370, 12370]) Position ids shape: torch.Size([1, 12370]) Input IDs shape: torch.Size([1, 12370]) Labels shape: torch.Size([1, 12370]) Final batch size: 1, sequence length: 11880 Attention mask shape: torch.Size([1, 1, 11880, 11880]) Position ids shape: torch.Size([1, 11880]) Input IDs shape: torch.Size([1, 11880]) Labels shape: torch.Size([1, 11880]) Final batch size: 1, sequence length: 14512 Attention mask shape: torch.Size([1, 1, 14512, 14512]) Position ids shape: torch.Size([1, 14512]) Input IDs shape: torch.Size([1, 14512]) Labels shape: torch.Size([1, 14512]) Final batch size: 1, sequence length: 13260 Attention mask shape: torch.Size([1, 1, 13260, 13260]) Position ids shape: torch.Size([1, 13260]) Input IDs shape: torch.Size([1, 13260]) Labels shape: torch.Size([1, 13260]) Final batch size: 1, sequence length: 14597 Attention mask shape: torch.Size([1, 1, 14597, 14597]) Position ids shape: torch.Size([1, 14597]) Input IDs shape: torch.Size([1, 14597]) Labels shape: torch.Size([1, 14597]) Final batch size: 1, sequence length: 17166 Attention mask shape: torch.Size([1, 1, 17166, 17166]) Position ids shape: torch.Size([1, 17166]) Input IDs shape: torch.Size([1, 17166]) Labels shape: torch.Size([1, 17166]) Final batch size: 1, sequence length: 15438 Attention mask shape: torch.Size([1, 1, 15438, 15438]) Position ids shape: torch.Size([1, 15438]) Input IDs shape: torch.Size([1, 15438]) Labels shape: torch.Size([1, 15438]) Final batch size: 1, sequence length: 18050 Attention mask shape: torch.Size([1, 1, 18050, 18050]) Position ids shape: torch.Size([1, 18050]) Input IDs shape: torch.Size([1, 18050]) Labels shape: torch.Size([1, 18050]) Final batch size: 1, sequence length: 18264 Attention mask shape: torch.Size([1, 1, 18264, 18264]) Position ids shape: torch.Size([1, 18264]) Input IDs shape: torch.Size([1, 18264]) Labels shape: torch.Size([1, 18264]) Final batch size: 1, sequence length: 16014 Attention mask shape: torch.Size([1, 1, 16014, 16014]) Position ids shape: torch.Size([1, 16014]) Input IDs shape: torch.Size([1, 16014]) Labels shape: torch.Size([1, 16014]) Final batch size: 1, sequence length: 17092 Attention mask shape: torch.Size([1, 1, 17092, 17092]) Position ids shape: torch.Size([1, 17092]) Input IDs shape: torch.Size([1, 17092]) Labels shape: torch.Size([1, 17092]) Final batch size: 1, sequence length: 14520 Attention mask shape: torch.Size([1, 1, 14520, 14520]) Position ids shape: torch.Size([1, 14520]) Input IDs shape: torch.Size([1, 14520]) Labels shape: torch.Size([1, 14520]) Final batch size: 1, sequence length: 11206 Attention mask shape: torch.Size([1, 1, 11206, 11206]) Position ids shape: torch.Size([1, 11206]) Input IDs shape: torch.Size([1, 11206]) Labels shape: torch.Size([1, 11206]) Final batch size: 1, sequence length: 18131 Attention mask shape: torch.Size([1, 1, 18131, 18131]) Position ids shape: torch.Size([1, 18131]) Input IDs shape: torch.Size([1, 18131]) Labels shape: torch.Size([1, 18131]) Final batch size: 1, sequence length: 17194 Attention mask shape: torch.Size([1, 1, 17194, 17194]) Position ids shape: torch.Size([1, 17194]) Input IDs shape: torch.Size([1, 17194]) Labels shape: torch.Size([1, 17194]) Final batch size: 1, sequence length: 17839 Attention mask shape: torch.Size([1, 1, 17839, 17839]) Position ids shape: torch.Size([1, 17839]) Input IDs shape: torch.Size([1, 17839]) Labels shape: torch.Size([1, 17839]) Final batch size: 1, sequence length: 20947 Attention mask shape: torch.Size([1, 1, 20947, 20947]) Position ids shape: torch.Size([1, 20947]) Input IDs shape: torch.Size([1, 20947]) Labels shape: torch.Size([1, 20947]) Final batch size: 1, sequence length: 14335 Attention mask shape: torch.Size([1, 1, 14335, 14335]) Position ids shape: torch.Size([1, 14335]) Input IDs shape: torch.Size([1, 14335]) Labels shape: torch.Size([1, 14335]) Final batch size: 1, sequence length: 13061 Attention mask shape: torch.Size([1, 1, 13061, 13061]) Position ids shape: torch.Size([1, 13061]) Input IDs shape: torch.Size([1, 13061]) Labels shape: torch.Size([1, 13061]) Final batch size: 1, sequence length: 24566 Attention mask shape: torch.Size([1, 1, 24566, 24566]) Position ids shape: torch.Size([1, 24566]) Input IDs shape: torch.Size([1, 24566]) Labels shape: torch.Size([1, 24566]) Final batch size: 1, sequence length: 20487 Attention mask shape: torch.Size([1, 1, 20487, 20487]) Position ids shape: torch.Size([1, 20487]) Input IDs shape: torch.Size([1, 20487]) Labels shape: torch.Size([1, 20487]) Final batch size: 1, sequence length: 18950 Attention mask shape: torch.Size([1, 1, 18950, 18950]) Position ids shape: torch.Size([1, 18950]) Input IDs shape: torch.Size([1, 18950]) Labels shape: torch.Size([1, 18950]) Final batch size: 1, sequence length: 22107 Attention mask shape: torch.Size([1, 1, 22107, 22107]) Position ids shape: torch.Size([1, 22107]) Input IDs shape: torch.Size([1, 22107]) Labels shape: torch.Size([1, 22107]) Final batch size: 1, sequence length: 23324 Attention mask shape: torch.Size([1, 1, 23324, 23324]) Position ids shape: torch.Size([1, 23324]) Input IDs shape: torch.Size([1, 23324]) Labels shape: torch.Size([1, 23324]) Final batch size: 1, sequence length: 21137 Attention mask shape: torch.Size([1, 1, 21137, 21137]) Position ids shape: torch.Size([1, 21137]) Input IDs shape: torch.Size([1, 21137]) Labels shape: torch.Size([1, 21137]) Final batch size: 1, sequence length: 26068 Attention mask shape: torch.Size([1, 1, 26068, 26068]) Position ids shape: torch.Size([1, 26068]) Input IDs shape: torch.Size([1, 26068]) Labels shape: torch.Size([1, 26068]) Final batch size: 1, sequence length: 17115 Attention mask shape: torch.Size([1, 1, 17115, 17115]) Position ids shape: torch.Size([1, 17115]) Input IDs shape: torch.Size([1, 17115]) Labels shape: torch.Size([1, 17115]) Final batch size: 1, sequence length: 22547 Attention mask shape: torch.Size([1, 1, 22547, 22547]) Position ids shape: torch.Size([1, 22547]) Input IDs shape: torch.Size([1, 22547]) Labels shape: torch.Size([1, 22547]) Final batch size: 1, sequence length: 24432 Attention mask shape: torch.Size([1, 1, 24432, 24432]) Position ids shape: torch.Size([1, 24432]) Input IDs shape: torch.Size([1, 24432]) Labels shape: torch.Size([1, 24432]) Final batch size: 1, sequence length: 26271 Attention mask shape: torch.Size([1, 1, 26271, 26271]) Position ids shape: torch.Size([1, 26271]) Input IDs shape: torch.Size([1, 26271]) Labels shape: torch.Size([1, 26271]) Final batch size: 1, sequence length: 26937 Attention mask shape: torch.Size([1, 1, 26937, 26937]) Position ids shape: torch.Size([1, 26937]) Input IDs shape: torch.Size([1, 26937]) Labels shape: torch.Size([1, 26937]) Final batch size: 1, sequence length: 26333 Attention mask shape: torch.Size([1, 1, 26333, 26333]) Position ids shape: torch.Size([1, 26333]) Input IDs shape: torch.Size([1, 26333]) Labels shape: torch.Size([1, 26333]) Final batch size: 1, sequence length: 23960 Attention mask shape: torch.Size([1, 1, 23960, 23960]) Position ids shape: torch.Size([1, 23960]) Input IDs shape: torch.Size([1, 23960]) Labels shape: torch.Size([1, 23960]) Final batch size: 1, sequence length: 28284 Attention mask shape: torch.Size([1, 1, 28284, 28284]) Position ids shape: torch.Size([1, 28284]) Input IDs shape: torch.Size([1, 28284]) Labels shape: torch.Size([1, 28284]) Final batch size: 1, sequence length: 29236 Attention mask shape: torch.Size([1, 1, 29236, 29236]) Position ids shape: torch.Size([1, 29236]) Input IDs shape: torch.Size([1, 29236]) Labels shape: torch.Size([1, 29236]) Final batch size: 1, sequence length: 28858 Attention mask shape: torch.Size([1, 1, 28858, 28858]) Position ids shape: torch.Size([1, 28858]) Input IDs shape: torch.Size([1, 28858]) Labels shape: torch.Size([1, 28858]) Final batch size: 1, sequence length: 28421 Attention mask shape: torch.Size([1, 1, 28421, 28421]) Position ids shape: torch.Size([1, 28421]) Input IDs shape: torch.Size([1, 28421]) Labels shape: torch.Size([1, 28421]) Final batch size: 1, sequence length: 25252 Attention mask shape: torch.Size([1, 1, 25252, 25252]) Position ids shape: torch.Size([1, 25252]) Input IDs shape: torch.Size([1, 25252]) Labels shape: torch.Size([1, 25252]) Final batch size: 1, sequence length: 21728 Attention mask shape: torch.Size([1, 1, 21728, 21728]) Position ids shape: torch.Size([1, 21728]) Input IDs shape: torch.Size([1, 21728]) Labels shape: torch.Size([1, 21728]) Final batch size: 1, sequence length: 10286 Attention mask shape: torch.Size([1, 1, 10286, 10286]) Position ids shape: torch.Size([1, 10286]) Input IDs shape: torch.Size([1, 10286]) Labels shape: torch.Size([1, 10286]) Final batch size: 1, sequence length: 15229 Attention mask shape: torch.Size([1, 1, 15229, 15229]) Position ids shape: torch.Size([1, 15229]) Input IDs shape: torch.Size([1, 15229]) Labels shape: torch.Size([1, 15229]) Final batch size: 1, sequence length: 30862 Attention mask shape: torch.Size([1, 1, 30862, 30862]) Position ids shape: torch.Size([1, 30862]) Input IDs shape: torch.Size([1, 30862]) Labels shape: torch.Size([1, 30862]) Final batch size: 1, sequence length: 31740 Attention mask shape: torch.Size([1, 1, 31740, 31740]) Position ids shape: torch.Size([1, 31740]) Input IDs shape: torch.Size([1, 31740]) Labels shape: torch.Size([1, 31740]) Final batch size: 1, sequence length: 29478 Attention mask shape: torch.Size([1, 1, 29478, 29478]) Position ids shape: torch.Size([1, 29478]) Input IDs shape: torch.Size([1, 29478]) Labels shape: torch.Size([1, 29478]) Final batch size: 1, sequence length: 29768 Attention mask shape: torch.Size([1, 1, 29768, 29768]) Position ids shape: torch.Size([1, 29768]) Input IDs shape: torch.Size([1, 29768]) Labels shape: torch.Size([1, 29768]) Final batch size: 1, sequence length: 18051 Attention mask shape: torch.Size([1, 1, 18051, 18051]) Position ids shape: torch.Size([1, 18051]) Input IDs shape: torch.Size([1, 18051]) Labels shape: torch.Size([1, 18051]) Final batch size: 1, sequence length: 32392 Attention mask shape: torch.Size([1, 1, 32392, 32392]) Position ids shape: torch.Size([1, 32392]) Input IDs shape: torch.Size([1, 32392]) Labels shape: torch.Size([1, 32392]) Final batch size: 1, sequence length: 7364 Attention mask shape: torch.Size([1, 1, 7364, 7364]) Position ids shape: torch.Size([1, 7364]) Input IDs shape: torch.Size([1, 7364]) Labels shape: torch.Size([1, 7364]) Final batch size: 1, sequence length: 27293 Attention mask shape: torch.Size([1, 1, 27293, 27293]) Position ids shape: torch.Size([1, 27293]) Input IDs shape: torch.Size([1, 27293]) Labels shape: torch.Size([1, 27293]) Final batch size: 1, sequence length: 31377 Attention mask shape: torch.Size([1, 1, 31377, 31377]) Position ids shape: torch.Size([1, 31377]) Input IDs shape: torch.Size([1, 31377]) Labels shape: torch.Size([1, 31377]) Final batch size: 1, sequence length: 29592 Attention mask shape: torch.Size([1, 1, 29592, 29592]) Position ids shape: torch.Size([1, 29592]) Input IDs shape: torch.Size([1, 29592]) Labels shape: torch.Size([1, 29592]) Final batch size: 1, sequence length: 13257 Attention mask shape: torch.Size([1, 1, 13257, 13257]) Position ids shape: torch.Size([1, 13257]) Input IDs shape: torch.Size([1, 13257]) Labels shape: torch.Size([1, 13257]) Final batch size: 1, sequence length: 33097 Attention mask shape: torch.Size([1, 1, 33097, 33097]) Position ids shape: torch.Size([1, 33097]) Input IDs shape: torch.Size([1, 33097]) Labels shape: torch.Size([1, 33097]) Final batch size: 1, sequence length: 32112 Attention mask shape: torch.Size([1, 1, 32112, 32112]) Position ids shape: torch.Size([1, 32112]) Input IDs shape: torch.Size([1, 32112]) Labels shape: torch.Size([1, 32112]) Final batch size: 1, sequence length: 31860 Attention mask shape: torch.Size([1, 1, 31860, 31860]) Position ids shape: torch.Size([1, 31860]) Input IDs shape: torch.Size([1, 31860]) Labels shape: torch.Size([1, 31860]) Final batch size: 1, sequence length: 18408 Attention mask shape: torch.Size([1, 1, 18408, 18408]) Position ids shape: torch.Size([1, 18408]) Input IDs shape: torch.Size([1, 18408]) Labels shape: torch.Size([1, 18408]) Final batch size: 1, sequence length: 32181 Attention mask shape: torch.Size([1, 1, 32181, 32181]) Position ids shape: torch.Size([1, 32181]) Input IDs shape: torch.Size([1, 32181]) Labels shape: torch.Size([1, 32181]) Final batch size: 1, sequence length: 19888 Attention mask shape: torch.Size([1, 1, 19888, 19888]) Position ids shape: torch.Size([1, 19888]) Input IDs shape: torch.Size([1, 19888]) Labels shape: torch.Size([1, 19888]) Final batch size: 1, sequence length: 27278 Attention mask shape: torch.Size([1, 1, 27278, 27278]) Position ids shape: torch.Size([1, 27278]) Input IDs shape: torch.Size([1, 27278]) Labels shape: torch.Size([1, 27278]) Final batch size: 1, sequence length: 21500 Attention mask shape: torch.Size([1, 1, 21500, 21500]) Position ids shape: torch.Size([1, 21500]) Input IDs shape: torch.Size([1, 21500]) Labels shape: torch.Size([1, 21500]) Final batch size: 1, sequence length: 35171 Attention mask shape: torch.Size([1, 1, 35171, 35171]) Position ids shape: torch.Size([1, 35171]) Input IDs shape: torch.Size([1, 35171]) Labels shape: torch.Size([1, 35171]) Final batch size: 1, sequence length: 29489 Attention mask shape: torch.Size([1, 1, 29489, 29489]) Position ids shape: torch.Size([1, 29489]) Input IDs shape: torch.Size([1, 29489]) Labels shape: torch.Size([1, 29489]) Final batch size: 1, sequence length: 9974 Attention mask shape: torch.Size([1, 1, 9974, 9974]) Position ids shape: torch.Size([1, 9974]) Input IDs shape: torch.Size([1, 9974]) Labels shape: torch.Size([1, 9974]) Final batch size: 1, sequence length: 37623 Attention mask shape: torch.Size([1, 1, 37623, 37623]) Position ids shape: torch.Size([1, 37623]) Input IDs shape: torch.Size([1, 37623]) Labels shape: torch.Size([1, 37623]) Final batch size: 1, sequence length: 30302 Attention mask shape: torch.Size([1, 1, 30302, 30302]) Position ids shape: torch.Size([1, 30302]) Input IDs shape: torch.Size([1, 30302]) Labels shape: torch.Size([1, 30302]) Final batch size: 1, sequence length: 33736 Attention mask shape: torch.Size([1, 1, 33736, 33736]) Position ids shape: torch.Size([1, 33736]) Input IDs shape: torch.Size([1, 33736]) Labels shape: torch.Size([1, 33736]) Final batch size: 1, sequence length: 35596 Attention mask shape: torch.Size([1, 1, 35596, 35596]) Position ids shape: torch.Size([1, 35596]) Input IDs shape: torch.Size([1, 35596]) Labels shape: torch.Size([1, 35596]) Final batch size: 1, sequence length: 35069 Attention mask shape: torch.Size([1, 1, 35069, 35069]) Position ids shape: torch.Size([1, 35069]) Input IDs shape: torch.Size([1, 35069]) Labels shape: torch.Size([1, 35069]) Final batch size: 1, sequence length: 38351 Attention mask shape: torch.Size([1, 1, 38351, 38351]) Position ids shape: torch.Size([1, 38351]) Input IDs shape: torch.Size([1, 38351]) Labels shape: torch.Size([1, 38351]) Final batch size: 1, sequence length: 30051 Attention mask shape: torch.Size([1, 1, 30051, 30051]) Position ids shape: torch.Size([1, 30051]) Input IDs shape: torch.Size([1, 30051]) Labels shape: torch.Size([1, 30051]) Final batch size: 1, sequence length: 13557 Attention mask shape: torch.Size([1, 1, 13557, 13557]) Position ids shape: torch.Size([1, 13557]) Input IDs shape: torch.Size([1, 13557]) Labels shape: torch.Size([1, 13557]) Final batch size: 1, sequence length: 32162 Attention mask shape: torch.Size([1, 1, 32162, 32162]) Position ids shape: torch.Size([1, 32162]) Input IDs shape: torch.Size([1, 32162]) Labels shape: torch.Size([1, 32162]) Final batch size: 1, sequence length: 34871 Attention mask shape: torch.Size([1, 1, 34871, 34871]) Position ids shape: torch.Size([1, 34871]) Input IDs shape: torch.Size([1, 34871]) Labels shape: torch.Size([1, 34871]) Final batch size: 1, sequence length: 38790 Attention mask shape: torch.Size([1, 1, 38790, 38790]) Position ids shape: torch.Size([1, 38790]) Input IDs shape: torch.Size([1, 38790]) Labels shape: torch.Size([1, 38790]) Final batch size: 1, sequence length: 34514 Attention mask shape: torch.Size([1, 1, 34514, 34514]) Position ids shape: torch.Size([1, 34514]) Input IDs shape: torch.Size([1, 34514]) Labels shape: torch.Size([1, 34514]) Final batch size: 1, sequence length: 25484 Attention mask shape: torch.Size([1, 1, 25484, 25484]) Position ids shape: torch.Size([1, 25484]) Input IDs shape: torch.Size([1, 25484]) Labels shape: torch.Size([1, 25484]) Final batch size: 1, sequence length: 17777 Attention mask shape: torch.Size([1, 1, 17777, 17777]) Position ids shape: torch.Size([1, 17777]) Input IDs shape: torch.Size([1, 17777]) Labels shape: torch.Size([1, 17777]) Final batch size: 1, sequence length: 24061 Attention mask shape: torch.Size([1, 1, 24061, 24061]) Position ids shape: torch.Size([1, 24061]) Input IDs shape: torch.Size([1, 24061]) Labels shape: torch.Size([1, 24061]) Final batch size: 1, sequence length: 21408 Attention mask shape: torch.Size([1, 1, 21408, 21408]) Position ids shape: torch.Size([1, 21408]) Input IDs shape: torch.Size([1, 21408]) Labels shape: torch.Size([1, 21408]) Final batch size: 1, sequence length: 32979 Attention mask shape: torch.Size([1, 1, 32979, 32979]) Position ids shape: torch.Size([1, 32979]) Input IDs shape: torch.Size([1, 32979]) Labels shape: torch.Size([1, 32979]) Final batch size: 1, sequence length: 30639 Attention mask shape: torch.Size([1, 1, 30639, 30639]) Position ids shape: torch.Size([1, 30639]) Input IDs shape: torch.Size([1, 30639]) Labels shape: torch.Size([1, 30639]) Final batch size: 1, sequence length: 25386 Attention mask shape: torch.Size([1, 1, 25386, 25386]) Position ids shape: torch.Size([1, 25386]) Input IDs shape: torch.Size([1, 25386]) Labels shape: torch.Size([1, 25386]) Final batch size: 1, sequence length: 31679 Attention mask shape: torch.Size([1, 1, 31679, 31679]) Position ids shape: torch.Size([1, 31679]) Input IDs shape: torch.Size([1, 31679]) Labels shape: torch.Size([1, 31679]) Final batch size: 1, sequence length: 24943 Attention mask shape: torch.Size([1, 1, 24943, 24943]) Position ids shape: torch.Size([1, 24943]) Input IDs shape: torch.Size([1, 24943]) Labels shape: torch.Size([1, 24943]) Final batch size: 1, sequence length: 18711 Attention mask shape: torch.Size([1, 1, 18711, 18711]) Position ids shape: torch.Size([1, 18711]) Input IDs shape: torch.Size([1, 18711]) Labels shape: torch.Size([1, 18711]) Final batch size: 1, sequence length: 31513 Attention mask shape: torch.Size([1, 1, 31513, 31513]) Position ids shape: torch.Size([1, 31513]) Input IDs shape: torch.Size([1, 31513]) Labels shape: torch.Size([1, 31513]) Final batch size: 1, sequence length: 39754 Attention mask shape: torch.Size([1, 1, 39754, 39754]) Position ids shape: torch.Size([1, 39754]) Input IDs shape: torch.Size([1, 39754]) Labels shape: torch.Size([1, 39754]) Final batch size: 1, sequence length: 34170 Attention mask shape: torch.Size([1, 1, 34170, 34170]) Position ids shape: torch.Size([1, 34170]) Input IDs shape: torch.Size([1, 34170]) Labels shape: torch.Size([1, 34170]) Final batch size: 1, sequence length: 23243 Attention mask shape: torch.Size([1, 1, 23243, 23243]) Position ids shape: torch.Size([1, 23243]) Input IDs shape: torch.Size([1, 23243]) Labels shape: torch.Size([1, 23243]) Final batch size: 1, sequence length: 36535 Attention mask shape: torch.Size([1, 1, 36535, 36535]) Position ids shape: torch.Size([1, 36535]) Input IDs shape: torch.Size([1, 36535]) Labels shape: torch.Size([1, 36535]) Final batch size: 1, sequence length: 18106 Attention mask shape: torch.Size([1, 1, 18106, 18106]) Position ids shape: torch.Size([1, 18106]) Input IDs shape: torch.Size([1, 18106]) Labels shape: torch.Size([1, 18106]) Final batch size: 1, sequence length: 33807 Attention mask shape: torch.Size([1, 1, 33807, 33807]) Position ids shape: torch.Size([1, 33807]) Input IDs shape: torch.Size([1, 33807]) Labels shape: torch.Size([1, 33807]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 32327 Attention mask shape: torch.Size([1, 1, 32327, 32327]) Position ids shape: torch.Size([1, 32327]) Input IDs shape: torch.Size([1, 32327]) Labels shape: torch.Size([1, 32327]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 21825 Attention mask shape: torch.Size([1, 1, 21825, 21825]) Position ids shape: torch.Size([1, 21825]) Input IDs shape: torch.Size([1, 21825]) Labels shape: torch.Size([1, 21825]) Final batch size: 1, sequence length: 24840 Attention mask shape: torch.Size([1, 1, 24840, 24840]) Position ids shape: torch.Size([1, 24840]) Input IDs shape: torch.Size([1, 24840]) Labels shape: torch.Size([1, 24840]) Final batch size: 1, sequence length: 15697 Attention mask shape: torch.Size([1, 1, 15697, 15697]) Position ids shape: torch.Size([1, 15697]) Input IDs shape: torch.Size([1, 15697]) Labels shape: torch.Size([1, 15697]) Final batch size: 1, sequence length: 24910 Attention mask shape: torch.Size([1, 1, 24910, 24910]) Position ids shape: torch.Size([1, 24910]) Input IDs shape: torch.Size([1, 24910]) Labels shape: torch.Size([1, 24910]) Final batch size: 1, sequence length: 33883 Attention mask shape: torch.Size([1, 1, 33883, 33883]) Position ids shape: torch.Size([1, 33883]) Input IDs shape: torch.Size([1, 33883]) Labels shape: torch.Size([1, 33883]) Final batch size: 1, sequence length: 27228 Attention mask shape: torch.Size([1, 1, 27228, 27228]) Position ids shape: torch.Size([1, 27228]) Input IDs shape: torch.Size([1, 27228]) Labels shape: torch.Size([1, 27228]) Final batch size: 1, sequence length: 30361 Attention mask shape: torch.Size([1, 1, 30361, 30361]) Position ids shape: torch.Size([1, 30361]) Input IDs shape: torch.Size([1, 30361]) Labels shape: torch.Size([1, 30361]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 29691 Attention mask shape: torch.Size([1, 1, 29691, 29691]) Position ids shape: torch.Size([1, 29691]) Input IDs shape: torch.Size([1, 29691]) Labels shape: torch.Size([1, 29691]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 39483 Attention mask shape: torch.Size([1, 1, 39483, 39483]) Position ids shape: torch.Size([1, 39483]) Input IDs shape: torch.Size([1, 39483]) Labels shape: torch.Size([1, 39483]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 32351 Attention mask shape: torch.Size([1, 1, 32351, 32351]) Position ids shape: torch.Size([1, 32351]) Input IDs shape: torch.Size([1, 32351]) Labels shape: torch.Size([1, 32351]) Final batch size: 1, sequence length: 15180 Attention mask shape: torch.Size([1, 1, 15180, 15180]) Position ids shape: torch.Size([1, 15180]) Input IDs shape: torch.Size([1, 15180]) Labels shape: torch.Size([1, 15180]) Final batch size: 1, sequence length: 33549 Attention mask shape: torch.Size([1, 1, 33549, 33549]) Position ids shape: torch.Size([1, 33549]) Input IDs shape: torch.Size([1, 33549]) Labels shape: torch.Size([1, 33549]) Final batch size: 1, sequence length: 13064 Attention mask shape: torch.Size([1, 1, 13064, 13064]) Position ids shape: torch.Size([1, 13064]) Input IDs shape: torch.Size([1, 13064]) Labels shape: torch.Size([1, 13064]) Final batch size: 1, sequence length: 30419 Attention mask shape: torch.Size([1, 1, 30419, 30419]) Position ids shape: torch.Size([1, 30419]) Input IDs shape: torch.Size([1, 30419]) Labels shape: torch.Size([1, 30419]) Final batch size: 1, sequence length: 36532 Attention mask shape: torch.Size([1, 1, 36532, 36532]) Position ids shape: torch.Size([1, 36532]) Input IDs shape: torch.Size([1, 36532]) Labels shape: torch.Size([1, 36532]) {'loss': 0.3238, 'grad_norm': 0.6484353950985995, 'learning_rate': 9.45503262094184e-06, 'num_tokens': -inf, 'epoch': 1.75} Final batch size: 1, sequence length: 8845 Attention mask shape: torch.Size([1, 1, 8845, 8845]) Position ids shape: torch.Size([1, 8845]) Input IDs shape: torch.Size([1, 8845]) Labels shape: torch.Size([1, 8845]) Final batch size: 1, sequence length: 7235 Attention mask shape: torch.Size([1, 1, 7235, 7235]) Position ids shape: torch.Size([1, 7235]) Input IDs shape: torch.Size([1, 7235]) Labels shape: torch.Size([1, 7235]) Final batch size: 1, sequence length: 12215 Attention mask shape: torch.Size([1, 1, 12215, 12215]) Position ids shape: torch.Size([1, 12215]) Input IDs shape: torch.Size([1, 12215]) Labels shape: torch.Size([1, 12215]) Final batch size: 1, sequence length: 12830 Attention mask shape: torch.Size([1, 1, 12830, 12830]) Position ids shape: torch.Size([1, 12830]) Input IDs shape: torch.Size([1, 12830]) Labels shape: torch.Size([1, 12830]) Final batch size: 1, sequence length: 13575 Attention mask shape: torch.Size([1, 1, 13575, 13575]) Position ids shape: torch.Size([1, 13575]) Input IDs shape: torch.Size([1, 13575]) Labels shape: torch.Size([1, 13575]) Final batch size: 1, sequence length: 12562 Attention mask shape: torch.Size([1, 1, 12562, 12562]) Position ids shape: torch.Size([1, 12562]) Input IDs shape: torch.Size([1, 12562]) Labels shape: torch.Size([1, 12562]) Final batch size: 1, sequence length: 12960 Attention mask shape: torch.Size([1, 1, 12960, 12960]) Position ids shape: torch.Size([1, 12960]) Input IDs shape: torch.Size([1, 12960]) Labels shape: torch.Size([1, 12960]) Final batch size: 1, sequence length: 14482 Attention mask shape: torch.Size([1, 1, 14482, 14482]) Position ids shape: torch.Size([1, 14482]) Input IDs shape: torch.Size([1, 14482]) Labels shape: torch.Size([1, 14482]) Final batch size: 1, sequence length: 12622 Attention mask shape: torch.Size([1, 1, 12622, 12622]) Position ids shape: torch.Size([1, 12622]) Input IDs shape: torch.Size([1, 12622]) Labels shape: torch.Size([1, 12622]) Final batch size: 1, sequence length: 17016 Attention mask shape: torch.Size([1, 1, 17016, 17016]) Position ids shape: torch.Size([1, 17016]) Input IDs shape: torch.Size([1, 17016]) Labels shape: torch.Size([1, 17016]) Final batch size: 1, sequence length: 17026 Attention mask shape: torch.Size([1, 1, 17026, 17026]) Position ids shape: torch.Size([1, 17026]) Input IDs shape: torch.Size([1, 17026]) Labels shape: torch.Size([1, 17026]) Final batch size: 1, sequence length: 14833 Attention mask shape: torch.Size([1, 1, 14833, 14833]) Position ids shape: torch.Size([1, 14833]) Input IDs shape: torch.Size([1, 14833]) Labels shape: torch.Size([1, 14833]) Final batch size: 1, sequence length: 13665 Attention mask shape: torch.Size([1, 1, 13665, 13665]) Position ids shape: torch.Size([1, 13665]) Input IDs shape: torch.Size([1, 13665]) Labels shape: torch.Size([1, 13665]) Final batch size: 1, sequence length: 14689 Attention mask shape: torch.Size([1, 1, 14689, 14689]) Position ids shape: torch.Size([1, 14689]) Input IDs shape: torch.Size([1, 14689]) Labels shape: torch.Size([1, 14689]) Final batch size: 1, sequence length: 17512 Attention mask shape: torch.Size([1, 1, 17512, 17512]) Position ids shape: torch.Size([1, 17512]) Input IDs shape: torch.Size([1, 17512]) Labels shape: torch.Size([1, 17512]) Final batch size: 1, sequence length: 15816 Attention mask shape: torch.Size([1, 1, 15816, 15816]) Position ids shape: torch.Size([1, 15816]) Input IDs shape: torch.Size([1, 15816]) Labels shape: torch.Size([1, 15816]) Final batch size: 1, sequence length: 19409 Attention mask shape: torch.Size([1, 1, 19409, 19409]) Position ids shape: torch.Size([1, 19409]) Input IDs shape: torch.Size([1, 19409]) Labels shape: torch.Size([1, 19409]) Final batch size: 1, sequence length: 15673 Attention mask shape: torch.Size([1, 1, 15673, 15673]) Position ids shape: torch.Size([1, 15673]) Input IDs shape: torch.Size([1, 15673]) Labels shape: torch.Size([1, 15673]) Final batch size: 1, sequence length: 18978 Attention mask shape: torch.Size([1, 1, 18978, 18978]) Position ids shape: torch.Size([1, 18978]) Input IDs shape: torch.Size([1, 18978]) Labels shape: torch.Size([1, 18978]) Final batch size: 1, sequence length: 16145 Attention mask shape: torch.Size([1, 1, 16145, 16145]) Position ids shape: torch.Size([1, 16145]) Input IDs shape: torch.Size([1, 16145]) Labels shape: torch.Size([1, 16145]) Final batch size: 1, sequence length: 17988 Attention mask shape: torch.Size([1, 1, 17988, 17988]) Position ids shape: torch.Size([1, 17988]) Input IDs shape: torch.Size([1, 17988]) Labels shape: torch.Size([1, 17988]) Final batch size: 1, sequence length: 20630 Attention mask shape: torch.Size([1, 1, 20630, 20630]) Position ids shape: torch.Size([1, 20630]) Input IDs shape: torch.Size([1, 20630]) Labels shape: torch.Size([1, 20630]) Final batch size: 1, sequence length: 18922 Attention mask shape: torch.Size([1, 1, 18922, 18922]) Position ids shape: torch.Size([1, 18922]) Input IDs shape: torch.Size([1, 18922]) Labels shape: torch.Size([1, 18922]) Final batch size: 1, sequence length: 20784 Attention mask shape: torch.Size([1, 1, 20784, 20784]) Position ids shape: torch.Size([1, 20784]) Input IDs shape: torch.Size([1, 20784]) Labels shape: torch.Size([1, 20784]) Final batch size: 1, sequence length: 21075 Attention mask shape: torch.Size([1, 1, 21075, 21075]) Position ids shape: torch.Size([1, 21075]) Input IDs shape: torch.Size([1, 21075]) Labels shape: torch.Size([1, 21075]) Final batch size: 1, sequence length: 19325 Attention mask shape: torch.Size([1, 1, 19325, 19325]) Position ids shape: torch.Size([1, 19325]) Input IDs shape: torch.Size([1, 19325]) Labels shape: torch.Size([1, 19325]) Final batch size: 1, sequence length: 15222 Attention mask shape: torch.Size([1, 1, 15222, 15222]) Position ids shape: torch.Size([1, 15222]) Input IDs shape: torch.Size([1, 15222]) Labels shape: torch.Size([1, 15222]) Final batch size: 1, sequence length: 19225 Attention mask shape: torch.Size([1, 1, 19225, 19225]) Position ids shape: torch.Size([1, 19225]) Input IDs shape: torch.Size([1, 19225]) Labels shape: torch.Size([1, 19225]) Final batch size: 1, sequence length: 10494 Attention mask shape: torch.Size([1, 1, 10494, 10494]) Position ids shape: torch.Size([1, 10494]) Input IDs shape: torch.Size([1, 10494]) Labels shape: torch.Size([1, 10494]) Final batch size: 1, sequence length: 20991 Attention mask shape: torch.Size([1, 1, 20991, 20991]) Position ids shape: torch.Size([1, 20991]) Input IDs shape: torch.Size([1, 20991]) Labels shape: torch.Size([1, 20991]) Final batch size: 1, sequence length: 21598 Attention mask shape: torch.Size([1, 1, 21598, 21598]) Position ids shape: torch.Size([1, 21598]) Input IDs shape: torch.Size([1, 21598]) Labels shape: torch.Size([1, 21598]) Final batch size: 1, sequence length: 22949 Attention mask shape: torch.Size([1, 1, 22949, 22949]) Position ids shape: torch.Size([1, 22949]) Input IDs shape: torch.Size([1, 22949]) Labels shape: torch.Size([1, 22949]) Final batch size: 1, sequence length: 20585 Attention mask shape: torch.Size([1, 1, 20585, 20585]) Position ids shape: torch.Size([1, 20585]) Input IDs shape: torch.Size([1, 20585]) Labels shape: torch.Size([1, 20585]) Final batch size: 1, sequence length: 19724 Attention mask shape: torch.Size([1, 1, 19724, 19724]) Position ids shape: torch.Size([1, 19724]) Input IDs shape: torch.Size([1, 19724]) Labels shape: torch.Size([1, 19724]) Final batch size: 1, sequence length: 12969 Attention mask shape: torch.Size([1, 1, 12969, 12969]) Position ids shape: torch.Size([1, 12969]) Input IDs shape: torch.Size([1, 12969]) Labels shape: torch.Size([1, 12969]) Final batch size: 1, sequence length: 20389 Attention mask shape: torch.Size([1, 1, 20389, 20389]) Position ids shape: torch.Size([1, 20389]) Input IDs shape: torch.Size([1, 20389]) Labels shape: torch.Size([1, 20389]) Final batch size: 1, sequence length: 14577 Attention mask shape: torch.Size([1, 1, 14577, 14577]) Position ids shape: torch.Size([1, 14577]) Input IDs shape: torch.Size([1, 14577]) Labels shape: torch.Size([1, 14577]) Final batch size: 1, sequence length: 6618 Attention mask shape: torch.Size([1, 1, 6618, 6618]) Position ids shape: torch.Size([1, 6618]) Input IDs shape: torch.Size([1, 6618]) Labels shape: torch.Size([1, 6618]) Final batch size: 1, sequence length: 20465 Attention mask shape: torch.Size([1, 1, 20465, 20465]) Position ids shape: torch.Size([1, 20465]) Input IDs shape: torch.Size([1, 20465]) Labels shape: torch.Size([1, 20465]) Final batch size: 1, sequence length: 13072 Attention mask shape: torch.Size([1, 1, 13072, 13072]) Position ids shape: torch.Size([1, 13072]) Input IDs shape: torch.Size([1, 13072]) Labels shape: torch.Size([1, 13072]) Final batch size: 1, sequence length: 23293 Attention mask shape: torch.Size([1, 1, 23293, 23293]) Position ids shape: torch.Size([1, 23293]) Input IDs shape: torch.Size([1, 23293]) Labels shape: torch.Size([1, 23293]) Final batch size: 1, sequence length: 3010 Attention mask shape: torch.Size([1, 1, 3010, 3010]) Position ids shape: torch.Size([1, 3010]) Input IDs shape: torch.Size([1, 3010]) Labels shape: torch.Size([1, 3010]) Final batch size: 1, sequence length: 22778 Attention mask shape: torch.Size([1, 1, 22778, 22778]) Position ids shape: torch.Size([1, 22778]) Input IDs shape: torch.Size([1, 22778]) Labels shape: torch.Size([1, 22778]) Final batch size: 1, sequence length: 21907 Attention mask shape: torch.Size([1, 1, 21907, 21907]) Position ids shape: torch.Size([1, 21907]) Input IDs shape: torch.Size([1, 21907]) Labels shape: torch.Size([1, 21907]) Final batch size: 1, sequence length: 23437 Attention mask shape: torch.Size([1, 1, 23437, 23437]) Position ids shape: torch.Size([1, 23437]) Input IDs shape: torch.Size([1, 23437]) Labels shape: torch.Size([1, 23437]) Final batch size: 1, sequence length: 23805 Attention mask shape: torch.Size([1, 1, 23805, 23805]) Position ids shape: torch.Size([1, 23805]) Input IDs shape: torch.Size([1, 23805]) Labels shape: torch.Size([1, 23805]) Final batch size: 1, sequence length: 23755 Attention mask shape: torch.Size([1, 1, 23755, 23755]) Position ids shape: torch.Size([1, 23755]) Input IDs shape: torch.Size([1, 23755]) Labels shape: torch.Size([1, 23755]) Final batch size: 1, sequence length: 14548 Attention mask shape: torch.Size([1, 1, 14548, 14548]) Position ids shape: torch.Size([1, 14548]) Input IDs shape: torch.Size([1, 14548]) Labels shape: torch.Size([1, 14548]) Final batch size: 1, sequence length: 23442 Attention mask shape: torch.Size([1, 1, 23442, 23442]) Position ids shape: torch.Size([1, 23442]) Input IDs shape: torch.Size([1, 23442]) Labels shape: torch.Size([1, 23442]) Final batch size: 1, sequence length: 26500 Attention mask shape: torch.Size([1, 1, 26500, 26500]) Position ids shape: torch.Size([1, 26500]) Input IDs shape: torch.Size([1, 26500]) Labels shape: torch.Size([1, 26500]) Final batch size: 1, sequence length: 26424 Attention mask shape: torch.Size([1, 1, 26424, 26424]) Position ids shape: torch.Size([1, 26424]) Input IDs shape: torch.Size([1, 26424]) Labels shape: torch.Size([1, 26424]) Final batch size: 1, sequence length: 27520 Attention mask shape: torch.Size([1, 1, 27520, 27520]) Position ids shape: torch.Size([1, 27520]) Input IDs shape: torch.Size([1, 27520]) Labels shape: torch.Size([1, 27520]) Final batch size: 1, sequence length: 21853 Attention mask shape: torch.Size([1, 1, 21853, 21853]) Position ids shape: torch.Size([1, 21853]) Input IDs shape: torch.Size([1, 21853]) Labels shape: torch.Size([1, 21853]) Final batch size: 1, sequence length: 11662 Attention mask shape: torch.Size([1, 1, 11662, 11662]) Position ids shape: torch.Size([1, 11662]) Input IDs shape: torch.Size([1, 11662]) Labels shape: torch.Size([1, 11662]) Final batch size: 1, sequence length: 25934 Attention mask shape: torch.Size([1, 1, 25934, 25934]) Position ids shape: torch.Size([1, 25934]) Input IDs shape: torch.Size([1, 25934]) Labels shape: torch.Size([1, 25934]) Final batch size: 1, sequence length: 28474 Attention mask shape: torch.Size([1, 1, 28474, 28474]) Position ids shape: torch.Size([1, 28474]) Input IDs shape: torch.Size([1, 28474]) Labels shape: torch.Size([1, 28474]) Final batch size: 1, sequence length: 14137 Attention mask shape: torch.Size([1, 1, 14137, 14137]) Position ids shape: torch.Size([1, 14137]) Input IDs shape: torch.Size([1, 14137]) Labels shape: torch.Size([1, 14137]) Final batch size: 1, sequence length: 27384 Attention mask shape: torch.Size([1, 1, 27384, 27384]) Position ids shape: torch.Size([1, 27384]) Input IDs shape: torch.Size([1, 27384]) Labels shape: torch.Size([1, 27384]) Final batch size: 1, sequence length: 28012 Attention mask shape: torch.Size([1, 1, 28012, 28012]) Position ids shape: torch.Size([1, 28012]) Input IDs shape: torch.Size([1, 28012]) Labels shape: torch.Size([1, 28012]) Final batch size: 1, sequence length: 26454 Attention mask shape: torch.Size([1, 1, 26454, 26454]) Position ids shape: torch.Size([1, 26454]) Input IDs shape: torch.Size([1, 26454]) Labels shape: torch.Size([1, 26454]) Final batch size: 1, sequence length: 22851 Attention mask shape: torch.Size([1, 1, 22851, 22851]) Position ids shape: torch.Size([1, 22851]) Input IDs shape: torch.Size([1, 22851]) Labels shape: torch.Size([1, 22851]) Final batch size: 1, sequence length: 21322 Attention mask shape: torch.Size([1, 1, 21322, 21322]) Position ids shape: torch.Size([1, 21322]) Input IDs shape: torch.Size([1, 21322]) Labels shape: torch.Size([1, 21322]) Final batch size: 1, sequence length: 24111 Attention mask shape: torch.Size([1, 1, 24111, 24111]) Position ids shape: torch.Size([1, 24111]) Input IDs shape: torch.Size([1, 24111]) Labels shape: torch.Size([1, 24111]) Final batch size: 1, sequence length: 19027 Attention mask shape: torch.Size([1, 1, 19027, 19027]) Position ids shape: torch.Size([1, 19027]) Input IDs shape: torch.Size([1, 19027]) Labels shape: torch.Size([1, 19027]) Final batch size: 1, sequence length: 20314 Attention mask shape: torch.Size([1, 1, 20314, 20314]) Position ids shape: torch.Size([1, 20314]) Input IDs shape: torch.Size([1, 20314]) Labels shape: torch.Size([1, 20314]) Final batch size: 1, sequence length: 30235 Attention mask shape: torch.Size([1, 1, 30235, 30235]) Position ids shape: torch.Size([1, 30235]) Input IDs shape: torch.Size([1, 30235]) Labels shape: torch.Size([1, 30235]) Final batch size: 1, sequence length: 24107 Attention mask shape: torch.Size([1, 1, 24107, 24107]) Position ids shape: torch.Size([1, 24107]) Input IDs shape: torch.Size([1, 24107]) Labels shape: torch.Size([1, 24107]) Final batch size: 1, sequence length: 22920 Attention mask shape: torch.Size([1, 1, 22920, 22920]) Position ids shape: torch.Size([1, 22920]) Input IDs shape: torch.Size([1, 22920]) Labels shape: torch.Size([1, 22920]) Final batch size: 1, sequence length: 24364 Attention mask shape: torch.Size([1, 1, 24364, 24364]) Position ids shape: torch.Size([1, 24364]) Input IDs shape: torch.Size([1, 24364]) Labels shape: torch.Size([1, 24364]) Final batch size: 1, sequence length: 20028 Attention mask shape: torch.Size([1, 1, 20028, 20028]) Position ids shape: torch.Size([1, 20028]) Input IDs shape: torch.Size([1, 20028]) Labels shape: torch.Size([1, 20028]) Final batch size: 1, sequence length: 29842 Attention mask shape: torch.Size([1, 1, 29842, 29842]) Position ids shape: torch.Size([1, 29842]) Input IDs shape: torch.Size([1, 29842]) Labels shape: torch.Size([1, 29842]) Final batch size: 1, sequence length: 31404 Attention mask shape: torch.Size([1, 1, 31404, 31404]) Position ids shape: torch.Size([1, 31404]) Input IDs shape: torch.Size([1, 31404]) Labels shape: torch.Size([1, 31404]) Final batch size: 1, sequence length: 12234 Attention mask shape: torch.Size([1, 1, 12234, 12234]) Position ids shape: torch.Size([1, 12234]) Input IDs shape: torch.Size([1, 12234]) Labels shape: torch.Size([1, 12234]) Final batch size: 1, sequence length: 12519 Attention mask shape: torch.Size([1, 1, 12519, 12519]) Position ids shape: torch.Size([1, 12519]) Input IDs shape: torch.Size([1, 12519]) Labels shape: torch.Size([1, 12519]) Final batch size: 1, sequence length: 24716 Attention mask shape: torch.Size([1, 1, 24716, 24716]) Final batch size: 1, sequence length: 34169 Position ids shape: torch.Size([1, 24716]) Attention mask shape: torch.Size([1, 1, 34169, 34169]) Position ids shape: torch.Size([1, 34169]) Input IDs shape: torch.Size([1, 24716]) Input IDs shape: torch.Size([1, 34169]) Labels shape: torch.Size([1, 24716]) Labels shape: torch.Size([1, 34169]) Final batch size: 1, sequence length: 23855 Final batch size: 1, sequence length: 23695 Attention mask shape: torch.Size([1, 1, 23855, 23855]) Attention mask shape: torch.Size([1, 1, 23695, 23695]) Position ids shape: torch.Size([1, 23695]) Input IDs shape: torch.Size([1, 23695]) Position ids shape: torch.Size([1, 23855]) Labels shape: torch.Size([1, 23695]) Input IDs shape: torch.Size([1, 23855]) Labels shape: torch.Size([1, 23855]) Final batch size: 1, sequence length: 33765 Attention mask shape: torch.Size([1, 1, 33765, 33765]) Position ids shape: torch.Size([1, 33765]) Input IDs shape: torch.Size([1, 33765]) Labels shape: torch.Size([1, 33765]) Final batch size: 1, sequence length: 24923 Attention mask shape: torch.Size([1, 1, 24923, 24923]) Position ids shape: torch.Size([1, 24923]) Input IDs shape: torch.Size([1, 24923]) Labels shape: torch.Size([1, 24923]) Final batch size: 1, sequence length: 27107 Attention mask shape: torch.Size([1, 1, 27107, 27107]) Position ids shape: torch.Size([1, 27107]) Input IDs shape: torch.Size([1, 27107]) Labels shape: torch.Size([1, 27107]) Final batch size: 1, sequence length: 21168 Attention mask shape: torch.Size([1, 1, 21168, 21168]) Position ids shape: torch.Size([1, 21168]) Input IDs shape: torch.Size([1, 21168]) Labels shape: torch.Size([1, 21168]) Final batch size: 1, sequence length: 30041 Attention mask shape: torch.Size([1, 1, 30041, 30041]) Position ids shape: torch.Size([1, 30041]) Input IDs shape: torch.Size([1, 30041]) Labels shape: torch.Size([1, 30041]) Final batch size: 1, sequence length: 24830 Attention mask shape: torch.Size([1, 1, 24830, 24830]) Position ids shape: torch.Size([1, 24830]) Input IDs shape: torch.Size([1, 24830]) Labels shape: torch.Size([1, 24830]) Final batch size: 1, sequence length: 33179 Attention mask shape: torch.Size([1, 1, 33179, 33179]) Position ids shape: torch.Size([1, 33179]) Input IDs shape: torch.Size([1, 33179]) Labels shape: torch.Size([1, 33179]) Final batch size: 1, sequence length: 17763 Attention mask shape: torch.Size([1, 1, 17763, 17763]) Position ids shape: torch.Size([1, 17763]) Input IDs shape: torch.Size([1, 17763]) Labels shape: torch.Size([1, 17763]) Final batch size: 1, sequence length: 10139 Attention mask shape: torch.Size([1, 1, 10139, 10139]) Position ids shape: torch.Size([1, 10139]) Input IDs shape: torch.Size([1, 10139]) Labels shape: torch.Size([1, 10139]) Final batch size: 1, sequence length: 29385 Attention mask shape: torch.Size([1, 1, 29385, 29385]) Position ids shape: torch.Size([1, 29385]) Input IDs shape: torch.Size([1, 29385]) Labels shape: torch.Size([1, 29385]) Final batch size: 1, sequence length: 38576 Attention mask shape: torch.Size([1, 1, 38576, 38576]) Position ids shape: torch.Size([1, 38576]) Input IDs shape: torch.Size([1, 38576]) Labels shape: torch.Size([1, 38576]) Final batch size: 1, sequence length: 37657 Attention mask shape: torch.Size([1, 1, 37657, 37657]) Position ids shape: torch.Size([1, 37657]) Input IDs shape: torch.Size([1, 37657]) Labels shape: torch.Size([1, 37657]) Final batch size: 1, sequence length: 24187 Attention mask shape: torch.Size([1, 1, 24187, 24187]) Position ids shape: torch.Size([1, 24187]) Input IDs shape: torch.Size([1, 24187]) Labels shape: torch.Size([1, 24187]) Final batch size: 1, sequence length: 33321 Attention mask shape: torch.Size([1, 1, 33321, 33321]) Position ids shape: torch.Size([1, 33321]) Input IDs shape: torch.Size([1, 33321]) Labels shape: torch.Size([1, 33321]) Final batch size: 1, sequence length: 19302 Attention mask shape: torch.Size([1, 1, 19302, 19302]) Position ids shape: torch.Size([1, 19302]) Input IDs shape: torch.Size([1, 19302]) Labels shape: torch.Size([1, 19302]) Final batch size: 1, sequence length: 29743 Attention mask shape: torch.Size([1, 1, 29743, 29743]) Position ids shape: torch.Size([1, 29743]) Input IDs shape: torch.Size([1, 29743]) Labels shape: torch.Size([1, 29743]) Final batch size: 1, sequence length: 40006 Attention mask shape: torch.Size([1, 1, 40006, 40006]) Position ids shape: torch.Size([1, 40006]) Input IDs shape: torch.Size([1, 40006]) Labels shape: torch.Size([1, 40006]) Final batch size: 1, sequence length: 26696 Attention mask shape: torch.Size([1, 1, 26696, 26696]) Position ids shape: torch.Size([1, 26696]) Input IDs shape: torch.Size([1, 26696]) Labels shape: torch.Size([1, 26696]) Final batch size: 1, sequence length: 35555 Attention mask shape: torch.Size([1, 1, 35555, 35555]) Position ids shape: torch.Size([1, 35555]) Input IDs shape: torch.Size([1, 35555]) Labels shape: torch.Size([1, 35555]) Final batch size: 1, sequence length: 27738 Attention mask shape: torch.Size([1, 1, 27738, 27738]) Position ids shape: torch.Size([1, 27738]) Input IDs shape: torch.Size([1, 27738]) Labels shape: torch.Size([1, 27738]) Final batch size: 1, sequence length: 32217 Attention mask shape: torch.Size([1, 1, 32217, 32217]) Position ids shape: torch.Size([1, 32217]) Input IDs shape: torch.Size([1, 32217]) Labels shape: torch.Size([1, 32217]) Final batch size: 1, sequence length: 18734 Attention mask shape: torch.Size([1, 1, 18734, 18734]) Position ids shape: torch.Size([1, 18734]) Input IDs shape: torch.Size([1, 18734]) Labels shape: torch.Size([1, 18734]) Final batch size: 1, sequence length: 26685 Attention mask shape: torch.Size([1, 1, 26685, 26685]) Position ids shape: torch.Size([1, 26685]) Input IDs shape: torch.Size([1, 26685]) Labels shape: torch.Size([1, 26685]) Final batch size: 1, sequence length: 29085 Attention mask shape: torch.Size([1, 1, 29085, 29085]) Position ids shape: torch.Size([1, 29085]) Input IDs shape: torch.Size([1, 29085]) Labels shape: torch.Size([1, 29085]) Final batch size: 1, sequence length: 29286 Attention mask shape: torch.Size([1, 1, 29286, 29286]) Position ids shape: torch.Size([1, 29286]) Input IDs shape: torch.Size([1, 29286]) Labels shape: torch.Size([1, 29286]) Final batch size: 1, sequence length: 23848 Attention mask shape: torch.Size([1, 1, 23848, 23848]) Position ids shape: torch.Size([1, 23848]) Input IDs shape: torch.Size([1, 23848]) Labels shape: torch.Size([1, 23848]) Final batch size: 1, sequence length: 35949 Attention mask shape: torch.Size([1, 1, 35949, 35949]) Position ids shape: torch.Size([1, 35949]) Input IDs shape: torch.Size([1, 35949]) Labels shape: torch.Size([1, 35949]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 31991 Attention mask shape: torch.Size([1, 1, 31991, 31991]) Position ids shape: torch.Size([1, 31991]) Input IDs shape: torch.Size([1, 31991]) Labels shape: torch.Size([1, 31991]) Final batch size: 1, sequence length: 39018 Attention mask shape: torch.Size([1, 1, 39018, 39018]) Position ids shape: torch.Size([1, 39018]) Input IDs shape: torch.Size([1, 39018]) Labels shape: torch.Size([1, 39018]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 31115 Attention mask shape: torch.Size([1, 1, 31115, 31115]) Position ids shape: torch.Size([1, 31115]) Input IDs shape: torch.Size([1, 31115]) Labels shape: torch.Size([1, 31115]) Final batch size: 1, sequence length: 22500 Attention mask shape: torch.Size([1, 1, 22500, 22500]) Position ids shape: torch.Size([1, 22500]) Input IDs shape: torch.Size([1, 22500]) Labels shape: torch.Size([1, 22500]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 19138 Attention mask shape: torch.Size([1, 1, 19138, 19138]) Position ids shape: torch.Size([1, 19138]) Input IDs shape: torch.Size([1, 19138]) Labels shape: torch.Size([1, 19138]) Final batch size: 1, sequence length: 34628 Attention mask shape: torch.Size([1, 1, 34628, 34628]) Position ids shape: torch.Size([1, 34628]) Input IDs shape: torch.Size([1, 34628]) Labels shape: torch.Size([1, 34628]) Final batch size: 1, sequence length: 25990 Attention mask shape: torch.Size([1, 1, 25990, 25990]) Position ids shape: torch.Size([1, 25990]) Input IDs shape: torch.Size([1, 25990]) Labels shape: torch.Size([1, 25990]) Final batch size: 1, sequence length: 40834 Attention mask shape: torch.Size([1, 1, 40834, 40834]) Position ids shape: torch.Size([1, 40834]) Input IDs shape: torch.Size([1, 40834]) Labels shape: torch.Size([1, 40834]) Final batch size: 1, sequence length: 37680 Attention mask shape: torch.Size([1, 1, 37680, 37680]) Position ids shape: torch.Size([1, 37680]) Input IDs shape: torch.Size([1, 37680]) Labels shape: torch.Size([1, 37680]) Final batch size: 1, sequence length: 32762 Attention mask shape: torch.Size([1, 1, 32762, 32762]) Position ids shape: torch.Size([1, 32762]) Input IDs shape: torch.Size([1, 32762]) Labels shape: torch.Size([1, 32762]) Final batch size: 1, sequence length: 22778 Attention mask shape: torch.Size([1, 1, 22778, 22778]) Position ids shape: torch.Size([1, 22778]) Input IDs shape: torch.Size([1, 22778]) Labels shape: torch.Size([1, 22778]) Final batch size: 1, sequence length: 32675 Attention mask shape: torch.Size([1, 1, 32675, 32675]) Position ids shape: torch.Size([1, 32675]) Input IDs shape: torch.Size([1, 32675]) Labels shape: torch.Size([1, 32675]) Final batch size: 1, sequence length: 26372 Attention mask shape: torch.Size([1, 1, 26372, 26372]) Position ids shape: torch.Size([1, 26372]) Input IDs shape: torch.Size([1, 26372]) Labels shape: torch.Size([1, 26372]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 34791 Attention mask shape: torch.Size([1, 1, 34791, 34791]) Position ids shape: torch.Size([1, 34791]) Input IDs shape: torch.Size([1, 34791]) Labels shape: torch.Size([1, 34791]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) {'loss': 0.3133, 'grad_norm': 0.5353261587136433, 'learning_rate': 9.330127018922195e-06, 'num_tokens': -inf, 'epoch': 1.88} Final batch size: 1, sequence length: 5686 Attention mask shape: torch.Size([1, 1, 5686, 5686]) Position ids shape: torch.Size([1, 5686]) Input IDs shape: torch.Size([1, 5686]) Labels shape: torch.Size([1, 5686]) Final batch size: 1, sequence length: 6986 Attention mask shape: torch.Size([1, 1, 6986, 6986]) Position ids shape: torch.Size([1, 6986]) Input IDs shape: torch.Size([1, 6986]) Labels shape: torch.Size([1, 6986]) Final batch size: 1, sequence length: 12847 Attention mask shape: torch.Size([1, 1, 12847, 12847]) Position ids shape: torch.Size([1, 12847]) Input IDs shape: torch.Size([1, 12847]) Labels shape: torch.Size([1, 12847]) Final batch size: 1, sequence length: 11304 Attention mask shape: torch.Size([1, 1, 11304, 11304]) Position ids shape: torch.Size([1, 11304]) Input IDs shape: torch.Size([1, 11304]) Labels shape: torch.Size([1, 11304]) Final batch size: 1, sequence length: 13872 Attention mask shape: torch.Size([1, 1, 13872, 13872]) Position ids shape: torch.Size([1, 13872]) Input IDs shape: torch.Size([1, 13872]) Labels shape: torch.Size([1, 13872]) Final batch size: 1, sequence length: 14304 Attention mask shape: torch.Size([1, 1, 14304, 14304]) Position ids shape: torch.Size([1, 14304]) Input IDs shape: torch.Size([1, 14304]) Labels shape: torch.Size([1, 14304]) Final batch size: 1, sequence length: 13224 Attention mask shape: torch.Size([1, 1, 13224, 13224]) Position ids shape: torch.Size([1, 13224]) Input IDs shape: torch.Size([1, 13224]) Labels shape: torch.Size([1, 13224]) Final batch size: 1, sequence length: 15621 Attention mask shape: torch.Size([1, 1, 15621, 15621]) Position ids shape: torch.Size([1, 15621]) Input IDs shape: torch.Size([1, 15621]) Labels shape: torch.Size([1, 15621]) Final batch size: 1, sequence length: 15652 Attention mask shape: torch.Size([1, 1, 15652, 15652]) Position ids shape: torch.Size([1, 15652]) Input IDs shape: torch.Size([1, 15652]) Labels shape: torch.Size([1, 15652]) Final batch size: 1, sequence length: 16440 Attention mask shape: torch.Size([1, 1, 16440, 16440]) Position ids shape: torch.Size([1, 16440]) Input IDs shape: torch.Size([1, 16440]) Labels shape: torch.Size([1, 16440]) Final batch size: 1, sequence length: 14168 Attention mask shape: torch.Size([1, 1, 14168, 14168]) Position ids shape: torch.Size([1, 14168]) Input IDs shape: torch.Size([1, 14168]) Labels shape: torch.Size([1, 14168]) Final batch size: 1, sequence length: 18246 Attention mask shape: torch.Size([1, 1, 18246, 18246]) Position ids shape: torch.Size([1, 18246]) Input IDs shape: torch.Size([1, 18246]) Labels shape: torch.Size([1, 18246]) Final batch size: 1, sequence length: 16514 Attention mask shape: torch.Size([1, 1, 16514, 16514]) Position ids shape: torch.Size([1, 16514]) Input IDs shape: torch.Size([1, 16514]) Labels shape: torch.Size([1, 16514]) Final batch size: 1, sequence length: 17773 Attention mask shape: torch.Size([1, 1, 17773, 17773]) Position ids shape: torch.Size([1, 17773]) Input IDs shape: torch.Size([1, 17773]) Labels shape: torch.Size([1, 17773]) Final batch size: 1, sequence length: 17213 Attention mask shape: torch.Size([1, 1, 17213, 17213]) Position ids shape: torch.Size([1, 17213]) Input IDs shape: torch.Size([1, 17213]) Labels shape: torch.Size([1, 17213]) Final batch size: 1, sequence length: 18044 Attention mask shape: torch.Size([1, 1, 18044, 18044]) Position ids shape: torch.Size([1, 18044]) Input IDs shape: torch.Size([1, 18044]) Labels shape: torch.Size([1, 18044]) Final batch size: 1, sequence length: 16501 Attention mask shape: torch.Size([1, 1, 16501, 16501]) Position ids shape: torch.Size([1, 16501]) Input IDs shape: torch.Size([1, 16501]) Labels shape: torch.Size([1, 16501]) Final batch size: 1, sequence length: 17852 Attention mask shape: torch.Size([1, 1, 17852, 17852]) Position ids shape: torch.Size([1, 17852]) Input IDs shape: torch.Size([1, 17852]) Labels shape: torch.Size([1, 17852]) Final batch size: 1, sequence length: 16378 Attention mask shape: torch.Size([1, 1, 16378, 16378]) Position ids shape: torch.Size([1, 16378]) Input IDs shape: torch.Size([1, 16378]) Labels shape: torch.Size([1, 16378]) Final batch size: 1, sequence length: 18289 Attention mask shape: torch.Size([1, 1, 18289, 18289]) Position ids shape: torch.Size([1, 18289]) Input IDs shape: torch.Size([1, 18289]) Labels shape: torch.Size([1, 18289]) Final batch size: 1, sequence length: 21207 Attention mask shape: torch.Size([1, 1, 21207, 21207]) Position ids shape: torch.Size([1, 21207]) Input IDs shape: torch.Size([1, 21207]) Labels shape: torch.Size([1, 21207]) Final batch size: 1, sequence length: 20766 Attention mask shape: torch.Size([1, 1, 20766, 20766]) Position ids shape: torch.Size([1, 20766]) Input IDs shape: torch.Size([1, 20766]) Labels shape: torch.Size([1, 20766]) Final batch size: 1, sequence length: 21163 Attention mask shape: torch.Size([1, 1, 21163, 21163]) Position ids shape: torch.Size([1, 21163]) Input IDs shape: torch.Size([1, 21163]) Labels shape: torch.Size([1, 21163]) Final batch size: 1, sequence length: 18155 Attention mask shape: torch.Size([1, 1, 18155, 18155]) Position ids shape: torch.Size([1, 18155]) Input IDs shape: torch.Size([1, 18155]) Labels shape: torch.Size([1, 18155]) Final batch size: 1, sequence length: 22784 Attention mask shape: torch.Size([1, 1, 22784, 22784]) Position ids shape: torch.Size([1, 22784]) Input IDs shape: torch.Size([1, 22784]) Labels shape: torch.Size([1, 22784]) Final batch size: 1, sequence length: 20813 Attention mask shape: torch.Size([1, 1, 20813, 20813]) Position ids shape: torch.Size([1, 20813]) Input IDs shape: torch.Size([1, 20813]) Labels shape: torch.Size([1, 20813]) Final batch size: 1, sequence length: 20176 Attention mask shape: torch.Size([1, 1, 20176, 20176]) Position ids shape: torch.Size([1, 20176]) Input IDs shape: torch.Size([1, 20176]) Labels shape: torch.Size([1, 20176]) Final batch size: 1, sequence length: 22152 Attention mask shape: torch.Size([1, 1, 22152, 22152]) Position ids shape: torch.Size([1, 22152]) Input IDs shape: torch.Size([1, 22152]) Labels shape: torch.Size([1, 22152]) Final batch size: 1, sequence length: 21316 Attention mask shape: torch.Size([1, 1, 21316, 21316]) Position ids shape: torch.Size([1, 21316]) Input IDs shape: torch.Size([1, 21316]) Labels shape: torch.Size([1, 21316]) Final batch size: 1, sequence length: 21607 Attention mask shape: torch.Size([1, 1, 21607, 21607]) Position ids shape: torch.Size([1, 21607]) Input IDs shape: torch.Size([1, 21607]) Labels shape: torch.Size([1, 21607]) Final batch size: 1, sequence length: 21562 Attention mask shape: torch.Size([1, 1, 21562, 21562]) Position ids shape: torch.Size([1, 21562]) Input IDs shape: torch.Size([1, 21562]) Labels shape: torch.Size([1, 21562]) Final batch size: 1, sequence length: 22197 Attention mask shape: torch.Size([1, 1, 22197, 22197]) Position ids shape: torch.Size([1, 22197]) Input IDs shape: torch.Size([1, 22197]) Labels shape: torch.Size([1, 22197]) Final batch size: 1, sequence length: 25880 Attention mask shape: torch.Size([1, 1, 25880, 25880]) Position ids shape: torch.Size([1, 25880]) Input IDs shape: torch.Size([1, 25880]) Labels shape: torch.Size([1, 25880]) Final batch size: 1, sequence length: 21705 Attention mask shape: torch.Size([1, 1, 21705, 21705]) Position ids shape: torch.Size([1, 21705]) Input IDs shape: torch.Size([1, 21705]) Labels shape: torch.Size([1, 21705]) Final batch size: 1, sequence length: 25053 Attention mask shape: torch.Size([1, 1, 25053, 25053]) Position ids shape: torch.Size([1, 25053]) Input IDs shape: torch.Size([1, 25053]) Labels shape: torch.Size([1, 25053]) Final batch size: 1, sequence length: 22117 Attention mask shape: torch.Size([1, 1, 22117, 22117]) Position ids shape: torch.Size([1, 22117]) Input IDs shape: torch.Size([1, 22117]) Labels shape: torch.Size([1, 22117]) Final batch size: 1, sequence length: 26494 Attention mask shape: torch.Size([1, 1, 26494, 26494]) Position ids shape: torch.Size([1, 26494]) Input IDs shape: torch.Size([1, 26494]) Labels shape: torch.Size([1, 26494]) Final batch size: 1, sequence length: 26921 Attention mask shape: torch.Size([1, 1, 26921, 26921]) Position ids shape: torch.Size([1, 26921]) Input IDs shape: torch.Size([1, 26921]) Labels shape: torch.Size([1, 26921]) Final batch size: 1, sequence length: 27005 Attention mask shape: torch.Size([1, 1, 27005, 27005]) Position ids shape: torch.Size([1, 27005]) Input IDs shape: torch.Size([1, 27005]) Labels shape: torch.Size([1, 27005]) Final batch size: 1, sequence length: 26303 Attention mask shape: torch.Size([1, 1, 26303, 26303]) Position ids shape: torch.Size([1, 26303]) Input IDs shape: torch.Size([1, 26303]) Labels shape: torch.Size([1, 26303]) Final batch size: 1, sequence length: 28523 Attention mask shape: torch.Size([1, 1, 28523, 28523]) Position ids shape: torch.Size([1, 28523]) Input IDs shape: torch.Size([1, 28523]) Labels shape: torch.Size([1, 28523]) Final batch size: 1, sequence length: 30592 Attention mask shape: torch.Size([1, 1, 30592, 30592]) Position ids shape: torch.Size([1, 30592]) Input IDs shape: torch.Size([1, 30592]) Labels shape: torch.Size([1, 30592]) Final batch size: 1, sequence length: 30151 Attention mask shape: torch.Size([1, 1, 30151, 30151]) Position ids shape: torch.Size([1, 30151]) Input IDs shape: torch.Size([1, 30151]) Labels shape: torch.Size([1, 30151]) Final batch size: 1, sequence length: 28897 Attention mask shape: torch.Size([1, 1, 28897, 28897]) Position ids shape: torch.Size([1, 28897]) Input IDs shape: torch.Size([1, 28897]) Labels shape: torch.Size([1, 28897]) Final batch size: 1, sequence length: 32473 Attention mask shape: torch.Size([1, 1, 32473, 32473]) Position ids shape: torch.Size([1, 32473]) Input IDs shape: torch.Size([1, 32473]) Labels shape: torch.Size([1, 32473]) Final batch size: 1, sequence length: 32596 Attention mask shape: torch.Size([1, 1, 32596, 32596]) Position ids shape: torch.Size([1, 32596]) Input IDs shape: torch.Size([1, 32596]) Labels shape: torch.Size([1, 32596]) Final batch size: 1, sequence length: 33482 Attention mask shape: torch.Size([1, 1, 33482, 33482]) Position ids shape: torch.Size([1, 33482]) Input IDs shape: torch.Size([1, 33482]) Labels shape: torch.Size([1, 33482]) Final batch size: 1, sequence length: 35100 Attention mask shape: torch.Size([1, 1, 35100, 35100]) Position ids shape: torch.Size([1, 35100]) Input IDs shape: torch.Size([1, 35100]) Labels shape: torch.Size([1, 35100]) Final batch size: 1, sequence length: 35030 Attention mask shape: torch.Size([1, 1, 35030, 35030]) Position ids shape: torch.Size([1, 35030]) Input IDs shape: torch.Size([1, 35030]) Labels shape: torch.Size([1, 35030]) Final batch size: 1, sequence length: 35523 Attention mask shape: torch.Size([1, 1, 35523, 35523]) Position ids shape: torch.Size([1, 35523]) Input IDs shape: torch.Size([1, 35523]) Labels shape: torch.Size([1, 35523]) Final batch size: 1, sequence length: 35485 Attention mask shape: torch.Size([1, 1, 35485, 35485]) Position ids shape: torch.Size([1, 35485]) Input IDs shape: torch.Size([1, 35485]) Labels shape: torch.Size([1, 35485]) Final batch size: 1, sequence length: 34469 Attention mask shape: torch.Size([1, 1, 34469, 34469]) Position ids shape: torch.Size([1, 34469]) Input IDs shape: torch.Size([1, 34469]) Labels shape: torch.Size([1, 34469]) Final batch size: 1, sequence length: 35866 Attention mask shape: torch.Size([1, 1, 35866, 35866]) Position ids shape: torch.Size([1, 35866]) Input IDs shape: torch.Size([1, 35866]) Labels shape: torch.Size([1, 35866]) Final batch size: 1, sequence length: 37980 Attention mask shape: torch.Size([1, 1, 37980, 37980]) Position ids shape: torch.Size([1, 37980]) Input IDs shape: torch.Size([1, 37980]) Labels shape: torch.Size([1, 37980]) Final batch size: 1, sequence length: 39589 Attention mask shape: torch.Size([1, 1, 39589, 39589]) Position ids shape: torch.Size([1, 39589]) Input IDs shape: torch.Size([1, 39589]) Labels shape: torch.Size([1, 39589]) Final batch size: 1, sequence length: 40526 Attention mask shape: torch.Size([1, 1, 40526, 40526]) Position ids shape: torch.Size([1, 40526]) Input IDs shape: torch.Size([1, 40526]) Labels shape: torch.Size([1, 40526]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 37040 Attention mask shape: torch.Size([1, 1, 37040, 37040]) Position ids shape: torch.Size([1, 37040]) Input IDs shape: torch.Size([1, 37040]) Labels shape: torch.Size([1, 37040]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 38927 Attention mask shape: torch.Size([1, 1, 38927, 38927]) Position ids shape: torch.Size([1, 38927]) Input IDs shape: torch.Size([1, 38927]) Labels shape: torch.Size([1, 38927]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) {'loss': 0.3081, 'grad_norm': 0.46068702911292303, 'learning_rate': 9.193352839727122e-06, 'num_tokens': -inf, 'epoch': 2.0} Final batch size: 1, sequence length: 5686 Attention mask shape: torch.Size([1, 1, 5686, 5686]) Position ids shape: torch.Size([1, 5686]) Input IDs shape: torch.Size([1, 5686]) Labels shape: torch.Size([1, 5686]) Final batch size: 1, sequence length: 6986 Attention mask shape: torch.Size([1, 1, 6986, 6986]) Position ids shape: torch.Size([1, 6986]) Input IDs shape: torch.Size([1, 6986]) Labels shape: torch.Size([1, 6986]) Final batch size: 1, sequence length: 6362 Attention mask shape: torch.Size([1, 1, 6362, 6362]) Position ids shape: torch.Size([1, 6362]) Input IDs shape: torch.Size([1, 6362]) Labels shape: torch.Size([1, 6362]) Final batch size: 1, sequence length: 10523 Attention mask shape: torch.Size([1, 1, 10523, 10523]) Position ids shape: torch.Size([1, 10523]) Input IDs shape: torch.Size([1, 10523]) Labels shape: torch.Size([1, 10523]) Final batch size: 1, sequence length: 11304 Attention mask shape: torch.Size([1, 1, 11304, 11304]) Position ids shape: torch.Size([1, 11304]) Input IDs shape: torch.Size([1, 11304]) Labels shape: torch.Size([1, 11304]) Final batch size: 1, sequence length: 13224 Attention mask shape: torch.Size([1, 1, 13224, 13224]) Position ids shape: torch.Size([1, 13224]) Input IDs shape: torch.Size([1, 13224]) Labels shape: torch.Size([1, 13224]) Final batch size: 1, sequence length: 12847 Attention mask shape: torch.Size([1, 1, 12847, 12847]) Position ids shape: torch.Size([1, 12847]) Input IDs shape: torch.Size([1, 12847]) Labels shape: torch.Size([1, 12847]) Final batch size: 1, sequence length: 14304 Attention mask shape: torch.Size([1, 1, 14304, 14304]) Position ids shape: torch.Size([1, 14304]) Input IDs shape: torch.Size([1, 14304]) Labels shape: torch.Size([1, 14304]) Final batch size: 1, sequence length: 14168 Attention mask shape: torch.Size([1, 1, 14168, 14168]) Position ids shape: torch.Size([1, 14168]) Input IDs shape: torch.Size([1, 14168]) Labels shape: torch.Size([1, 14168]) Final batch size: 1, sequence length: 13872 Attention mask shape: torch.Size([1, 1, 13872, 13872]) Position ids shape: torch.Size([1, 13872]) Input IDs shape: torch.Size([1, 13872]) Labels shape: torch.Size([1, 13872]) Final batch size: 1, sequence length: 15652 Attention mask shape: torch.Size([1, 1, 15652, 15652]) Position ids shape: torch.Size([1, 15652]) Input IDs shape: torch.Size([1, 15652]) Labels shape: torch.Size([1, 15652]) Final batch size: 1, sequence length: 16514 Attention mask shape: torch.Size([1, 1, 16514, 16514]) Position ids shape: torch.Size([1, 16514]) Input IDs shape: torch.Size([1, 16514]) Labels shape: torch.Size([1, 16514]) Final batch size: 1, sequence length: 16501 Attention mask shape: torch.Size([1, 1, 16501, 16501]) Position ids shape: torch.Size([1, 16501]) Input IDs shape: torch.Size([1, 16501]) Labels shape: torch.Size([1, 16501]) Final batch size: 1, sequence length: 15621 Attention mask shape: torch.Size([1, 1, 15621, 15621]) Position ids shape: torch.Size([1, 15621]) Input IDs shape: torch.Size([1, 15621]) Labels shape: torch.Size([1, 15621]) Final batch size: 1, sequence length: 16440 Attention mask shape: torch.Size([1, 1, 16440, 16440]) Position ids shape: torch.Size([1, 16440]) Input IDs shape: torch.Size([1, 16440]) Labels shape: torch.Size([1, 16440]) Final batch size: 1, sequence length: 18246 Attention mask shape: torch.Size([1, 1, 18246, 18246]) Position ids shape: torch.Size([1, 18246]) Input IDs shape: torch.Size([1, 18246]) Labels shape: torch.Size([1, 18246]) Final batch size: 1, sequence length: 16378 Attention mask shape: torch.Size([1, 1, 16378, 16378]) Position ids shape: torch.Size([1, 16378]) Input IDs shape: torch.Size([1, 16378]) Labels shape: torch.Size([1, 16378]) Final batch size: 1, sequence length: 18044 Attention mask shape: torch.Size([1, 1, 18044, 18044]) Position ids shape: torch.Size([1, 18044]) Input IDs shape: torch.Size([1, 18044]) Labels shape: torch.Size([1, 18044]) Final batch size: 1, sequence length: 17213 Attention mask shape: torch.Size([1, 1, 17213, 17213]) Position ids shape: torch.Size([1, 17213]) Input IDs shape: torch.Size([1, 17213]) Labels shape: torch.Size([1, 17213]) Final batch size: 1, sequence length: 17852 Attention mask shape: torch.Size([1, 1, 17852, 17852]) Position ids shape: torch.Size([1, 17852]) Input IDs shape: torch.Size([1, 17852]) Labels shape: torch.Size([1, 17852]) Final batch size: 1, sequence length: 18289 Attention mask shape: torch.Size([1, 1, 18289, 18289]) Position ids shape: torch.Size([1, 18289]) Input IDs shape: torch.Size([1, 18289]) Labels shape: torch.Size([1, 18289]) Final batch size: 1, sequence length: 18155 Attention mask shape: torch.Size([1, 1, 18155, 18155]) Position ids shape: torch.Size([1, 18155]) Input IDs shape: torch.Size([1, 18155]) Labels shape: torch.Size([1, 18155]) Final batch size: 1, sequence length: 17773 Attention mask shape: torch.Size([1, 1, 17773, 17773]) Position ids shape: torch.Size([1, 17773]) Input IDs shape: torch.Size([1, 17773]) Labels shape: torch.Size([1, 17773]) Final batch size: 1, sequence length: 20176 Attention mask shape: torch.Size([1, 1, 20176, 20176]) Position ids shape: torch.Size([1, 20176]) Input IDs shape: torch.Size([1, 20176]) Labels shape: torch.Size([1, 20176]) Final batch size: 1, sequence length: 21163 Attention mask shape: torch.Size([1, 1, 21163, 21163]) Position ids shape: torch.Size([1, 21163]) Input IDs shape: torch.Size([1, 21163]) Labels shape: torch.Size([1, 21163]) Final batch size: 1, sequence length: 20813 Attention mask shape: torch.Size([1, 1, 20813, 20813]) Position ids shape: torch.Size([1, 20813]) Input IDs shape: torch.Size([1, 20813]) Labels shape: torch.Size([1, 20813]) Final batch size: 1, sequence length: 21562 Attention mask shape: torch.Size([1, 1, 21562, 21562]) Position ids shape: torch.Size([1, 21562]) Input IDs shape: torch.Size([1, 21562]) Labels shape: torch.Size([1, 21562]) Final batch size: 1, sequence length: 21705 Attention mask shape: torch.Size([1, 1, 21705, 21705]) Position ids shape: torch.Size([1, 21705]) Input IDs shape: torch.Size([1, 21705]) Labels shape: torch.Size([1, 21705]) Final batch size: 1, sequence length: 21607 Attention mask shape: torch.Size([1, 1, 21607, 21607]) Position ids shape: torch.Size([1, 21607]) Input IDs shape: torch.Size([1, 21607]) Labels shape: torch.Size([1, 21607]) Final batch size: 1, sequence length: 22152 Attention mask shape: torch.Size([1, 1, 22152, 22152]) Position ids shape: torch.Size([1, 22152]) Input IDs shape: torch.Size([1, 22152]) Labels shape: torch.Size([1, 22152]) Final batch size: 1, sequence length: 22197 Attention mask shape: torch.Size([1, 1, 22197, 22197]) Position ids shape: torch.Size([1, 22197]) Input IDs shape: torch.Size([1, 22197]) Labels shape: torch.Size([1, 22197]) Final batch size: 1, sequence length: 5911 Attention mask shape: torch.Size([1, 1, 5911, 5911]) Position ids shape: torch.Size([1, 5911]) Input IDs shape: torch.Size([1, 5911]) Labels shape: torch.Size([1, 5911]) Final batch size: 1, sequence length: 21316 Attention mask shape: torch.Size([1, 1, 21316, 21316]) Position ids shape: torch.Size([1, 21316]) Input IDs shape: torch.Size([1, 21316]) Labels shape: torch.Size([1, 21316]) Final batch size: 1, sequence length: 20766 Attention mask shape: torch.Size([1, 1, 20766, 20766]) Position ids shape: torch.Size([1, 20766]) Input IDs shape: torch.Size([1, 20766]) Labels shape: torch.Size([1, 20766]) Final batch size: 1, sequence length: 18545 Attention mask shape: torch.Size([1, 1, 18545, 18545]) Position ids shape: torch.Size([1, 18545]) Input IDs shape: torch.Size([1, 18545]) Labels shape: torch.Size([1, 18545]) Final batch size: 1, sequence length: 17910 Attention mask shape: torch.Size([1, 1, 17910, 17910]) Position ids shape: torch.Size([1, 17910]) Input IDs shape: torch.Size([1, 17910]) Labels shape: torch.Size([1, 17910]) Final batch size: 1, sequence length: 21207 Attention mask shape: torch.Size([1, 1, 21207, 21207]) Position ids shape: torch.Size([1, 21207]) Input IDs shape: torch.Size([1, 21207]) Labels shape: torch.Size([1, 21207]) Final batch size: 1, sequence length: 22784 Attention mask shape: torch.Size([1, 1, 22784, 22784]) Position ids shape: torch.Size([1, 22784]) Input IDs shape: torch.Size([1, 22784]) Labels shape: torch.Size([1, 22784]) Final batch size: 1, sequence length: 22117 Attention mask shape: torch.Size([1, 1, 22117, 22117]) Position ids shape: torch.Size([1, 22117]) Input IDs shape: torch.Size([1, 22117]) Labels shape: torch.Size([1, 22117]) Final batch size: 1, sequence length: 14496 Attention mask shape: torch.Size([1, 1, 14496, 14496]) Position ids shape: torch.Size([1, 14496]) Input IDs shape: torch.Size([1, 14496]) Labels shape: torch.Size([1, 14496]) Final batch size: 1, sequence length: 25053 Attention mask shape: torch.Size([1, 1, 25053, 25053]) Position ids shape: torch.Size([1, 25053]) Input IDs shape: torch.Size([1, 25053]) Labels shape: torch.Size([1, 25053]) Final batch size: 1, sequence length: 21962 Attention mask shape: torch.Size([1, 1, 21962, 21962]) Position ids shape: torch.Size([1, 21962]) Input IDs shape: torch.Size([1, 21962]) Labels shape: torch.Size([1, 21962]) Final batch size: 1, sequence length: 12421 Attention mask shape: torch.Size([1, 1, 12421, 12421]) Position ids shape: torch.Size([1, 12421]) Input IDs shape: torch.Size([1, 12421]) Labels shape: torch.Size([1, 12421]) Final batch size: 1, sequence length: 19962 Attention mask shape: torch.Size([1, 1, 19962, 19962]) Position ids shape: torch.Size([1, 19962]) Input IDs shape: torch.Size([1, 19962]) Labels shape: torch.Size([1, 19962]) Final batch size: 1, sequence length: 26494 Attention mask shape: torch.Size([1, 1, 26494, 26494]) Position ids shape: torch.Size([1, 26494]) Input IDs shape: torch.Size([1, 26494]) Labels shape: torch.Size([1, 26494]) Final batch size: 1, sequence length: 23338 Attention mask shape: torch.Size([1, 1, 23338, 23338]) Position ids shape: torch.Size([1, 23338]) Input IDs shape: torch.Size([1, 23338]) Labels shape: torch.Size([1, 23338]) Final batch size: 1, sequence length: 26921 Attention mask shape: torch.Size([1, 1, 26921, 26921]) Position ids shape: torch.Size([1, 26921]) Input IDs shape: torch.Size([1, 26921]) Labels shape: torch.Size([1, 26921]) Final batch size: 1, sequence length: 13031 Attention mask shape: torch.Size([1, 1, 13031, 13031]) Position ids shape: torch.Size([1, 13031]) Input IDs shape: torch.Size([1, 13031]) Labels shape: torch.Size([1, 13031]) Final batch size: 1, sequence length: 11515 Attention mask shape: torch.Size([1, 1, 11515, 11515]) Position ids shape: torch.Size([1, 11515]) Input IDs shape: torch.Size([1, 11515]) Labels shape: torch.Size([1, 11515]) Final batch size: 1, sequence length: 21672 Attention mask shape: torch.Size([1, 1, 21672, 21672]) Position ids shape: torch.Size([1, 21672]) Input IDs shape: torch.Size([1, 21672]) Labels shape: torch.Size([1, 21672]) Final batch size: 1, sequence length: 26303 Attention mask shape: torch.Size([1, 1, 26303, 26303]) Position ids shape: torch.Size([1, 26303]) Input IDs shape: torch.Size([1, 26303]) Labels shape: torch.Size([1, 26303]) Final batch size: 1, sequence length: 27633 Attention mask shape: torch.Size([1, 1, 27633, 27633]) Position ids shape: torch.Size([1, 27633]) Input IDs shape: torch.Size([1, 27633]) Labels shape: torch.Size([1, 27633]) Final batch size: 1, sequence length: 24365 Attention mask shape: torch.Size([1, 1, 24365, 24365]) Position ids shape: torch.Size([1, 24365]) Input IDs shape: torch.Size([1, 24365]) Labels shape: torch.Size([1, 24365]) Final batch size: 1, sequence length: 18470 Attention mask shape: torch.Size([1, 1, 18470, 18470]) Position ids shape: torch.Size([1, 18470]) Input IDs shape: torch.Size([1, 18470]) Labels shape: torch.Size([1, 18470]) Final batch size: 1, sequence length: 20101 Attention mask shape: torch.Size([1, 1, 20101, 20101]) Position ids shape: torch.Size([1, 20101]) Input IDs shape: torch.Size([1, 20101]) Labels shape: torch.Size([1, 20101]) Final batch size: 1, sequence length: 23975 Attention mask shape: torch.Size([1, 1, 23975, 23975]) Position ids shape: torch.Size([1, 23975]) Input IDs shape: torch.Size([1, 23975]) Labels shape: torch.Size([1, 23975]) Final batch size: 1, sequence length: 12224 Attention mask shape: torch.Size([1, 1, 12224, 12224]) Position ids shape: torch.Size([1, 12224]) Input IDs shape: torch.Size([1, 12224]) Labels shape: torch.Size([1, 12224]) Final batch size: 1, sequence length: 9704 Attention mask shape: torch.Size([1, 1, 9704, 9704]) Position ids shape: torch.Size([1, 9704]) Input IDs shape: torch.Size([1, 9704]) Labels shape: torch.Size([1, 9704]) Final batch size: 1, sequence length: 28523 Attention mask shape: torch.Size([1, 1, 28523, 28523]) Position ids shape: torch.Size([1, 28523]) Input IDs shape: torch.Size([1, 28523]) Labels shape: torch.Size([1, 28523]) Final batch size: 1, sequence length: 16057 Attention mask shape: torch.Size([1, 1, 16057, 16057]) Position ids shape: torch.Size([1, 16057]) Input IDs shape: torch.Size([1, 16057]) Labels shape: torch.Size([1, 16057]) Final batch size: 1, sequence length: 26138 Attention mask shape: torch.Size([1, 1, 26138, 26138]) Position ids shape: torch.Size([1, 26138]) Input IDs shape: torch.Size([1, 26138]) Labels shape: torch.Size([1, 26138]) Final batch size: 1, sequence length: 32473 Attention mask shape: torch.Size([1, 1, 32473, 32473]) Position ids shape: torch.Size([1, 32473]) Input IDs shape: torch.Size([1, 32473]) Labels shape: torch.Size([1, 32473]) Final batch size: 1, sequence length: 29404 Attention mask shape: torch.Size([1, 1, 29404, 29404]) Position ids shape: torch.Size([1, 29404]) Input IDs shape: torch.Size([1, 29404]) Labels shape: torch.Size([1, 29404]) Final batch size: 1, sequence length: 20198 Attention mask shape: torch.Size([1, 1, 20198, 20198]) Position ids shape: torch.Size([1, 20198]) Input IDs shape: torch.Size([1, 20198]) Labels shape: torch.Size([1, 20198]) Final batch size: 1, sequence length: 21581 Attention mask shape: torch.Size([1, 1, 21581, 21581]) Position ids shape: torch.Size([1, 21581]) Input IDs shape: torch.Size([1, 21581]) Labels shape: torch.Size([1, 21581]) Final batch size: 1, sequence length: 28897 Attention mask shape: torch.Size([1, 1, 28897, 28897]) Position ids shape: torch.Size([1, 28897]) Input IDs shape: torch.Size([1, 28897]) Labels shape: torch.Size([1, 28897]) Final batch size: 1, sequence length: 25880 Attention mask shape: torch.Size([1, 1, 25880, 25880]) Position ids shape: torch.Size([1, 25880]) Input IDs shape: torch.Size([1, 25880]) Labels shape: torch.Size([1, 25880]) Final batch size: 1, sequence length: 21766 Attention mask shape: torch.Size([1, 1, 21766, 21766]) Position ids shape: torch.Size([1, 21766]) Input IDs shape: torch.Size([1, 21766]) Labels shape: torch.Size([1, 21766]) Final batch size: 1, sequence length: 20184 Attention mask shape: torch.Size([1, 1, 20184, 20184]) Position ids shape: torch.Size([1, 20184]) Input IDs shape: torch.Size([1, 20184]) Labels shape: torch.Size([1, 20184]) Final batch size: 1, sequence length: 30592 Attention mask shape: torch.Size([1, 1, 30592, 30592]) Position ids shape: torch.Size([1, 30592]) Input IDs shape: torch.Size([1, 30592]) Labels shape: torch.Size([1, 30592]) Final batch size: 1, sequence length: 17456 Attention mask shape: torch.Size([1, 1, 17456, 17456]) Position ids shape: torch.Size([1, 17456]) Input IDs shape: torch.Size([1, 17456]) Labels shape: torch.Size([1, 17456]) Final batch size: 1, sequence length: 23558 Attention mask shape: torch.Size([1, 1, 23558, 23558]) Position ids shape: torch.Size([1, 23558]) Input IDs shape: torch.Size([1, 23558]) Labels shape: torch.Size([1, 23558]) Final batch size: 1, sequence length: 35100 Attention mask shape: torch.Size([1, 1, 35100, 35100]) Position ids shape: torch.Size([1, 35100]) Input IDs shape: torch.Size([1, 35100]) Labels shape: torch.Size([1, 35100]) Final batch size: 1, sequence length: 35485 Attention mask shape: torch.Size([1, 1, 35485, 35485]) Position ids shape: torch.Size([1, 35485]) Input IDs shape: torch.Size([1, 35485]) Labels shape: torch.Size([1, 35485]) Final batch size: 1, sequence length: 30965 Attention mask shape: torch.Size([1, 1, 30965, 30965]) Position ids shape: torch.Size([1, 30965]) Input IDs shape: torch.Size([1, 30965]) Labels shape: torch.Size([1, 30965]) Final batch size: 1, sequence length: 33482 Attention mask shape: torch.Size([1, 1, 33482, 33482]) Position ids shape: torch.Size([1, 33482]) Input IDs shape: torch.Size([1, 33482]) Labels shape: torch.Size([1, 33482]) Final batch size: 1, sequence length: 34469 Attention mask shape: torch.Size([1, 1, 34469, 34469]) Position ids shape: torch.Size([1, 34469]) Input IDs shape: torch.Size([1, 34469]) Labels shape: torch.Size([1, 34469]) Final batch size: 1, sequence length: 29481 Attention mask shape: torch.Size([1, 1, 29481, 29481]) Position ids shape: torch.Size([1, 29481]) Input IDs shape: torch.Size([1, 29481]) Labels shape: torch.Size([1, 29481]) Final batch size: 1, sequence length: 19620 Attention mask shape: torch.Size([1, 1, 19620, 19620]) Position ids shape: torch.Size([1, 19620]) Input IDs shape: torch.Size([1, 19620]) Labels shape: torch.Size([1, 19620]) Final batch size: 1, sequence length: 32596 Attention mask shape: torch.Size([1, 1, 32596, 32596]) Position ids shape: torch.Size([1, 32596]) Input IDs shape: torch.Size([1, 32596]) Labels shape: torch.Size([1, 32596]) Final batch size: 1, sequence length: 24121 Attention mask shape: torch.Size([1, 1, 24121, 24121]) Position ids shape: torch.Size([1, 24121]) Input IDs shape: torch.Size([1, 24121]) Labels shape: torch.Size([1, 24121]) Final batch size: 1, sequence length: 18126 Attention mask shape: torch.Size([1, 1, 18126, 18126]) Position ids shape: torch.Size([1, 18126]) Input IDs shape: torch.Size([1, 18126]) Labels shape: torch.Size([1, 18126]) Final batch size: 1, sequence length: 28773 Attention mask shape: torch.Size([1, 1, 28773, 28773]) Position ids shape: torch.Size([1, 28773]) Input IDs shape: torch.Size([1, 28773]) Labels shape: torch.Size([1, 28773]) Final batch size: 1, sequence length: 16257 Attention mask shape: torch.Size([1, 1, 16257, 16257]) Position ids shape: torch.Size([1, 16257]) Input IDs shape: torch.Size([1, 16257]) Labels shape: torch.Size([1, 16257]) Final batch size: 1, sequence length: 21491 Attention mask shape: torch.Size([1, 1, 21491, 21491]) Position ids shape: torch.Size([1, 21491]) Input IDs shape: torch.Size([1, 21491]) Labels shape: torch.Size([1, 21491]) Final batch size: 1, sequence length: 33685 Attention mask shape: torch.Size([1, 1, 33685, 33685]) Position ids shape: torch.Size([1, 33685]) Input IDs shape: torch.Size([1, 33685]) Labels shape: torch.Size([1, 33685]) Final batch size: 1, sequence length: 37980 Attention mask shape: torch.Size([1, 1, 37980, 37980]) Position ids shape: torch.Size([1, 37980]) Input IDs shape: torch.Size([1, 37980]) Labels shape: torch.Size([1, 37980]) Final batch size: 1, sequence length: 18454 Attention mask shape: torch.Size([1, 1, 18454, 18454]) Position ids shape: torch.Size([1, 18454]) Input IDs shape: torch.Size([1, 18454]) Labels shape: torch.Size([1, 18454]) Final batch size: 1, sequence length: 37040 Attention mask shape: torch.Size([1, 1, 37040, 37040]) Position ids shape: torch.Size([1, 37040]) Input IDs shape: torch.Size([1, 37040]) Labels shape: torch.Size([1, 37040]) Final batch size: 1, sequence length: 30428 Attention mask shape: torch.Size([1, 1, 30428, 30428]) Position ids shape: torch.Size([1, 30428]) Input IDs shape: torch.Size([1, 30428]) Labels shape: torch.Size([1, 30428]) Final batch size: 1, sequence length: 40526 Attention mask shape: torch.Size([1, 1, 40526, 40526]) Position ids shape: torch.Size([1, 40526]) Input IDs shape: torch.Size([1, 40526]) Labels shape: torch.Size([1, 40526]) Final batch size: 1, sequence length: 29537 Attention mask shape: torch.Size([1, 1, 29537, 29537]) Position ids shape: torch.Size([1, 29537]) Input IDs shape: torch.Size([1, 29537]) Labels shape: torch.Size([1, 29537]) Final batch size: 1, sequence length: 17870 Attention mask shape: torch.Size([1, 1, 17870, 17870]) Position ids shape: torch.Size([1, 17870]) Input IDs shape: torch.Size([1, 17870]) Labels shape: torch.Size([1, 17870]) Final batch size: 1, sequence length: 35030 Attention mask shape: torch.Size([1, 1, 35030, 35030]) Position ids shape: torch.Size([1, 35030]) Input IDs shape: torch.Size([1, 35030]) Labels shape: torch.Size([1, 35030]) Final batch size: 1, sequence length: 11608 Attention mask shape: torch.Size([1, 1, 11608, 11608]) Position ids shape: torch.Size([1, 11608]) Input IDs shape: torch.Size([1, 11608]) Labels shape: torch.Size([1, 11608]) Final batch size: 1, sequence length: 26215 Attention mask shape: torch.Size([1, 1, 26215, 26215]) Position ids shape: torch.Size([1, 26215]) Input IDs shape: torch.Size([1, 26215]) Labels shape: torch.Size([1, 26215]) Final batch size: 1, sequence length: 13622 Attention mask shape: torch.Size([1, 1, 13622, 13622]) Position ids shape: torch.Size([1, 13622]) Input IDs shape: torch.Size([1, 13622]) Labels shape: torch.Size([1, 13622]) Final batch size: 1, sequence length: 18023 Attention mask shape: torch.Size([1, 1, 18023, 18023]) Position ids shape: torch.Size([1, 18023]) Input IDs shape: torch.Size([1, 18023]) Labels shape: torch.Size([1, 18023]) Final batch size: 1, sequence length: 30072 Attention mask shape: torch.Size([1, 1, 30072, 30072]) Position ids shape: torch.Size([1, 30072]) Input IDs shape: torch.Size([1, 30072]) Labels shape: torch.Size([1, 30072]) Final batch size: 1, sequence length: 27702 Attention mask shape: torch.Size([1, 1, 27702, 27702]) Position ids shape: torch.Size([1, 27702]) Input IDs shape: torch.Size([1, 27702]) Labels shape: torch.Size([1, 27702]) Final batch size: 1, sequence length: 30789 Attention mask shape: torch.Size([1, 1, 30789, 30789]) Position ids shape: torch.Size([1, 30789]) Input IDs shape: torch.Size([1, 30789]) Labels shape: torch.Size([1, 30789]) Final batch size: 1, sequence length: 21250 Attention mask shape: torch.Size([1, 1, 21250, 21250]) Position ids shape: torch.Size([1, 21250]) Input IDs shape: torch.Size([1, 21250]) Labels shape: torch.Size([1, 21250]) Final batch size: 1, sequence length: 26635 Attention mask shape: torch.Size([1, 1, 26635, 26635]) Position ids shape: torch.Size([1, 26635]) Input IDs shape: torch.Size([1, 26635]) Labels shape: torch.Size([1, 26635]) Final batch size: 1, sequence length: 38927 Attention mask shape: torch.Size([1, 1, 38927, 38927]) Position ids shape: torch.Size([1, 38927]) Input IDs shape: torch.Size([1, 38927]) Labels shape: torch.Size([1, 38927]) Final batch size: 1, sequence length: 26708 Attention mask shape: torch.Size([1, 1, 26708, 26708]) Position ids shape: torch.Size([1, 26708]) Input IDs shape: torch.Size([1, 26708]) Labels shape: torch.Size([1, 26708]) Final batch size: 1, sequence length: 39589 Attention mask shape: torch.Size([1, 1, 39589, 39589]) Position ids shape: torch.Size([1, 39589]) Input IDs shape: torch.Size([1, 39589]) Labels shape: torch.Size([1, 39589]) Final batch size: 1, sequence length: 29875 Attention mask shape: torch.Size([1, 1, 29875, 29875]) Position ids shape: torch.Size([1, 29875]) Input IDs shape: torch.Size([1, 29875]) Labels shape: torch.Size([1, 29875]) Final batch size: 1, sequence length: 19702 Attention mask shape: torch.Size([1, 1, 19702, 19702]) Position ids shape: torch.Size([1, 19702]) Input IDs shape: torch.Size([1, 19702]) Labels shape: torch.Size([1, 19702]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 30772 Attention mask shape: torch.Size([1, 1, 30772, 30772]) Position ids shape: torch.Size([1, 30772]) Input IDs shape: torch.Size([1, 30772]) Labels shape: torch.Size([1, 30772]) Final batch size: 1, sequence length: 22309 Attention mask shape: torch.Size([1, 1, 22309, 22309]) Position ids shape: torch.Size([1, 22309]) Input IDs shape: torch.Size([1, 22309]) Labels shape: torch.Size([1, 22309]) Final batch size: 1, sequence length: 11403 Attention mask shape: torch.Size([1, 1, 11403, 11403]) Position ids shape: torch.Size([1, 11403]) Input IDs shape: torch.Size([1, 11403]) Labels shape: torch.Size([1, 11403]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 19538 Attention mask shape: torch.Size([1, 1, 19538, 19538]) Position ids shape: torch.Size([1, 19538]) Input IDs shape: torch.Size([1, 19538]) Labels shape: torch.Size([1, 19538]) Final batch size: 1, sequence length: 37992 Attention mask shape: torch.Size([1, 1, 37992, 37992]) Position ids shape: torch.Size([1, 37992]) Input IDs shape: torch.Size([1, 37992]) Labels shape: torch.Size([1, 37992]) Final batch size: 1, sequence length: 15830 Attention mask shape: torch.Size([1, 1, 15830, 15830]) Position ids shape: torch.Size([1, 15830]) Input IDs shape: torch.Size([1, 15830]) Labels shape: torch.Size([1, 15830]) Final batch size: 1, sequence length: 36580 Attention mask shape: torch.Size([1, 1, 36580, 36580]) Position ids shape: torch.Size([1, 36580]) Input IDs shape: torch.Size([1, 36580]) Labels shape: torch.Size([1, 36580]) Final batch size: 1, sequence length: 30859 Attention mask shape: torch.Size([1, 1, 30859, 30859]) Position ids shape: torch.Size([1, 30859]) Input IDs shape: torch.Size([1, 30859]) Labels shape: torch.Size([1, 30859]) Final batch size: 1, sequence length: 14730 Attention mask shape: torch.Size([1, 1, 14730, 14730]) Position ids shape: torch.Size([1, 14730]) Input IDs shape: torch.Size([1, 14730]) Labels shape: torch.Size([1, 14730]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 21936 Attention mask shape: torch.Size([1, 1, 21936, 21936]) Position ids shape: torch.Size([1, 21936]) Input IDs shape: torch.Size([1, 21936]) Labels shape: torch.Size([1, 21936]) Final batch size: 1, sequence length: 35077 Attention mask shape: torch.Size([1, 1, 35077, 35077]) Position ids shape: torch.Size([1, 35077]) Input IDs shape: torch.Size([1, 35077]) Labels shape: torch.Size([1, 35077]) Final batch size: 1, sequence length: 34684 Attention mask shape: torch.Size([1, 1, 34684, 34684]) Position ids shape: torch.Size([1, 34684]) Input IDs shape: torch.Size([1, 34684]) Labels shape: torch.Size([1, 34684]) Final batch size: 1, sequence length: 15993 Attention mask shape: torch.Size([1, 1, 15993, 15993]) Position ids shape: torch.Size([1, 15993]) Input IDs shape: torch.Size([1, 15993]) Labels shape: torch.Size([1, 15993]) Final batch size: 1, sequence length: 17811 Attention mask shape: torch.Size([1, 1, 17811, 17811]) Position ids shape: torch.Size([1, 17811]) Input IDs shape: torch.Size([1, 17811]) Labels shape: torch.Size([1, 17811]) Final batch size: 1, sequence length: 17623 Attention mask shape: torch.Size([1, 1, 17623, 17623]) Position ids shape: torch.Size([1, 17623]) Input IDs shape: torch.Size([1, 17623]) Labels shape: torch.Size([1, 17623]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 17971 Attention mask shape: torch.Size([1, 1, 17971, 17971]) Position ids shape: torch.Size([1, 17971]) Input IDs shape: torch.Size([1, 17971]) Labels shape: torch.Size([1, 17971]) Final batch size: 1, sequence length: 31745 Attention mask shape: torch.Size([1, 1, 31745, 31745]) Position ids shape: torch.Size([1, 31745]) Input IDs shape: torch.Size([1, 31745]) Labels shape: torch.Size([1, 31745]) Final batch size: 1, sequence length: 18606 Attention mask shape: torch.Size([1, 1, 18606, 18606]) Position ids shape: torch.Size([1, 18606]) Input IDs shape: torch.Size([1, 18606]) Labels shape: torch.Size([1, 18606]) Final batch size: 1, sequence length: 39661 Attention mask shape: torch.Size([1, 1, 39661, 39661]) Position ids shape: torch.Size([1, 39661]) Input IDs shape: torch.Size([1, 39661]) Labels shape: torch.Size([1, 39661]) Final batch size: 1, sequence length: 29464 Attention mask shape: torch.Size([1, 1, 29464, 29464]) Position ids shape: torch.Size([1, 29464]) Input IDs shape: torch.Size([1, 29464]) Labels shape: torch.Size([1, 29464]) Final batch size: 1, sequence length: 26665 Attention mask shape: torch.Size([1, 1, 26665, 26665]) Position ids shape: torch.Size([1, 26665]) Input IDs shape: torch.Size([1, 26665]) Labels shape: torch.Size([1, 26665]) Final batch size: 1, sequence length: 29639 Attention mask shape: torch.Size([1, 1, 29639, 29639]) Position ids shape: torch.Size([1, 29639]) Input IDs shape: torch.Size([1, 29639]) Labels shape: torch.Size([1, 29639]) Final batch size: 1, sequence length: 28814 Attention mask shape: torch.Size([1, 1, 28814, 28814]) Position ids shape: torch.Size([1, 28814]) Input IDs shape: torch.Size([1, 28814]) Labels shape: torch.Size([1, 28814]) Final batch size: 1, sequence length: 28810 Attention mask shape: torch.Size([1, 1, 28810, 28810]) Position ids shape: torch.Size([1, 28810]) Input IDs shape: torch.Size([1, 28810]) Labels shape: torch.Size([1, 28810]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 16270 Attention mask shape: torch.Size([1, 1, 16270, 16270]) Position ids shape: torch.Size([1, 16270]) Input IDs shape: torch.Size([1, 16270]) Labels shape: torch.Size([1, 16270]) Final batch size: 1, sequence length: 13903 Attention mask shape: torch.Size([1, 1, 13903, 13903]) Position ids shape: torch.Size([1, 13903]) Input IDs shape: torch.Size([1, 13903]) Labels shape: torch.Size([1, 13903]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 36716 Attention mask shape: torch.Size([1, 1, 36716, 36716]) Position ids shape: torch.Size([1, 36716]) Input IDs shape: torch.Size([1, 36716]) Labels shape: torch.Size([1, 36716]) Final batch size: 1, sequence length: 22945 Attention mask shape: torch.Size([1, 1, 22945, 22945]) Position ids shape: torch.Size([1, 22945]) Input IDs shape: torch.Size([1, 22945]) Labels shape: torch.Size([1, 22945]) Final batch size: 1, sequence length: 19028 Attention mask shape: torch.Size([1, 1, 19028, 19028]) Position ids shape: torch.Size([1, 19028]) Input IDs shape: torch.Size([1, 19028]) Labels shape: torch.Size([1, 19028]) Final batch size: 1, sequence length: 17778 Attention mask shape: torch.Size([1, 1, 17778, 17778]) Position ids shape: torch.Size([1, 17778]) Input IDs shape: torch.Size([1, 17778]) Labels shape: torch.Size([1, 17778]) Final batch size: 1, sequence length: 31381 Attention mask shape: torch.Size([1, 1, 31381, 31381]) Position ids shape: torch.Size([1, 31381]) Input IDs shape: torch.Size([1, 31381]) Labels shape: torch.Size([1, 31381]) Final batch size: 1, sequence length: 26930 Attention mask shape: torch.Size([1, 1, 26930, 26930]) Position ids shape: torch.Size([1, 26930]) Input IDs shape: torch.Size([1, 26930]) Labels shape: torch.Size([1, 26930]) Final batch size: 1, sequence length: 37407 Attention mask shape: torch.Size([1, 1, 37407, 37407]) Position ids shape: torch.Size([1, 37407]) Input IDs shape: torch.Size([1, 37407]) Labels shape: torch.Size([1, 37407]) Final batch size: 1, sequence length: 35952 Attention mask shape: torch.Size([1, 1, 35952, 35952]) Position ids shape: torch.Size([1, 35952]) Input IDs shape: torch.Size([1, 35952]) Labels shape: torch.Size([1, 35952]) Final batch size: 1, sequence length: 30135 Attention mask shape: torch.Size([1, 1, 30135, 30135]) Position ids shape: torch.Size([1, 30135]) Input IDs shape: torch.Size([1, 30135]) Labels shape: torch.Size([1, 30135]) Final batch size: 1, sequence length: 24232 Attention mask shape: torch.Size([1, 1, 24232, 24232]) Position ids shape: torch.Size([1, 24232]) Input IDs shape: torch.Size([1, 24232]) Labels shape: torch.Size([1, 24232]) Final batch size: 1, sequence length: 20765 Attention mask shape: torch.Size([1, 1, 20765, 20765]) Position ids shape: torch.Size([1, 20765]) Input IDs shape: torch.Size([1, 20765]) Labels shape: torch.Size([1, 20765]) Final batch size: 1, sequence length: 37866 Attention mask shape: torch.Size([1, 1, 37866, 37866]) Position ids shape: torch.Size([1, 37866]) Input IDs shape: torch.Size([1, 37866]) Labels shape: torch.Size([1, 37866]) Final batch size: 1, sequence length: 27281 Attention mask shape: torch.Size([1, 1, 27281, 27281]) Position ids shape: torch.Size([1, 27281]) Input IDs shape: torch.Size([1, 27281]) Labels shape: torch.Size([1, 27281]) Final batch size: 1, sequence length: 20057 Attention mask shape: torch.Size([1, 1, 20057, 20057]) Position ids shape: torch.Size([1, 20057]) Input IDs shape: torch.Size([1, 20057]) Labels shape: torch.Size([1, 20057]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 21805 Attention mask shape: torch.Size([1, 1, 21805, 21805]) Position ids shape: torch.Size([1, 21805]) Input IDs shape: torch.Size([1, 21805]) Labels shape: torch.Size([1, 21805]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 31420 Attention mask shape: torch.Size([1, 1, 31420, 31420]) Position ids shape: torch.Size([1, 31420]) Input IDs shape: torch.Size([1, 31420]) Labels shape: torch.Size([1, 31420]) Final batch size: 1, sequence length: 32024 Attention mask shape: torch.Size([1, 1, 32024, 32024]) Position ids shape: torch.Size([1, 32024]) Input IDs shape: torch.Size([1, 32024]) Labels shape: torch.Size([1, 32024]) Final batch size: 1, sequence length: 21071 Attention mask shape: torch.Size([1, 1, 21071, 21071]) Position ids shape: torch.Size([1, 21071]) Input IDs shape: torch.Size([1, 21071]) Labels shape: torch.Size([1, 21071]) Final batch size: 1, sequence length: 25032 Attention mask shape: torch.Size([1, 1, 25032, 25032]) Position ids shape: torch.Size([1, 25032]) Input IDs shape: torch.Size([1, 25032]) Labels shape: torch.Size([1, 25032]) Final batch size: 1, sequence length: 37397 Attention mask shape: torch.Size([1, 1, 37397, 37397]) Position ids shape: torch.Size([1, 37397]) Input IDs shape: torch.Size([1, 37397]) Labels shape: torch.Size([1, 37397]) Final batch size: 1, sequence length: 14104 Attention mask shape: torch.Size([1, 1, 14104, 14104]) Position ids shape: torch.Size([1, 14104]) Input IDs shape: torch.Size([1, 14104]) Labels shape: torch.Size([1, 14104]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 21006 Attention mask shape: torch.Size([1, 1, 21006, 21006]) Position ids shape: torch.Size([1, 21006]) Input IDs shape: torch.Size([1, 21006]) Labels shape: torch.Size([1, 21006]) Final batch size: 1, sequence length: 25388 Attention mask shape: torch.Size([1, 1, 25388, 25388]) Position ids shape: torch.Size([1, 25388]) Input IDs shape: torch.Size([1, 25388]) Labels shape: torch.Size([1, 25388]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 35879 Attention mask shape: torch.Size([1, 1, 35879, 35879]) Position ids shape: torch.Size([1, 35879]) Input IDs shape: torch.Size([1, 35879]) Labels shape: torch.Size([1, 35879]) Final batch size: 1, sequence length: 18566 Attention mask shape: torch.Size([1, 1, 18566, 18566]) Position ids shape: torch.Size([1, 18566]) Input IDs shape: torch.Size([1, 18566]) Labels shape: torch.Size([1, 18566]) Final batch size: 1, sequence length: 40579 Attention mask shape: torch.Size([1, 1, 40579, 40579]) Position ids shape: torch.Size([1, 40579]) Input IDs shape: torch.Size([1, 40579]) Labels shape: torch.Size([1, 40579]) Final batch size: 1, sequence length: 36415 Attention mask shape: torch.Size([1, 1, 36415, 36415]) Position ids shape: torch.Size([1, 36415]) Input IDs shape: torch.Size([1, 36415]) Labels shape: torch.Size([1, 36415]) Final batch size: 1, sequence length: 25383 Attention mask shape: torch.Size([1, 1, 25383, 25383]) Position ids shape: torch.Size([1, 25383]) Input IDs shape: torch.Size([1, 25383]) Labels shape: torch.Size([1, 25383]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 23187 Attention mask shape: torch.Size([1, 1, 23187, 23187]) Position ids shape: torch.Size([1, 23187]) Input IDs shape: torch.Size([1, 23187]) Labels shape: torch.Size([1, 23187]) Final batch size: 1, sequence length: 37530 Attention mask shape: torch.Size([1, 1, 37530, 37530]) Position ids shape: torch.Size([1, 37530]) Input IDs shape: torch.Size([1, 37530]) Labels shape: torch.Size([1, 37530]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 39778 Attention mask shape: torch.Size([1, 1, 39778, 39778]) Position ids shape: torch.Size([1, 39778]) Input IDs shape: torch.Size([1, 39778]) Labels shape: torch.Size([1, 39778]) Final batch size: 1, sequence length: 12200 Attention mask shape: torch.Size([1, 1, 12200, 12200]) Position ids shape: torch.Size([1, 12200]) Input IDs shape: torch.Size([1, 12200]) Labels shape: torch.Size([1, 12200]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 32541 Attention mask shape: torch.Size([1, 1, 32541, 32541]) Position ids shape: torch.Size([1, 32541]) Input IDs shape: torch.Size([1, 32541]) Labels shape: torch.Size([1, 32541]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 19668 Attention mask shape: torch.Size([1, 1, 19668, 19668]) Position ids shape: torch.Size([1, 19668]) Input IDs shape: torch.Size([1, 19668]) Labels shape: torch.Size([1, 19668]) Final batch size: 1, sequence length: 39587 Attention mask shape: torch.Size([1, 1, 39587, 39587]) Position ids shape: torch.Size([1, 39587]) Input IDs shape: torch.Size([1, 39587]) Labels shape: torch.Size([1, 39587]) Final batch size: 1, sequence length: 39579 Attention mask shape: torch.Size([1, 1, 39579, 39579]) Position ids shape: torch.Size([1, 39579]) Input IDs shape: torch.Size([1, 39579]) Labels shape: torch.Size([1, 39579]) Final batch size: 1, sequence length: 9251 Attention mask shape: torch.Size([1, 1, 9251, 9251]) Position ids shape: torch.Size([1, 9251]) Input IDs shape: torch.Size([1, 9251]) Labels shape: torch.Size([1, 9251]) Final batch size: 1, sequence length: 37890 Attention mask shape: torch.Size([1, 1, 37890, 37890]) Position ids shape: torch.Size([1, 37890]) Input IDs shape: torch.Size([1, 37890]) Labels shape: torch.Size([1, 37890]) Final batch size: 1, sequence length: 14308 Attention mask shape: torch.Size([1, 1, 14308, 14308]) Position ids shape: torch.Size([1, 14308]) Input IDs shape: torch.Size([1, 14308]) Labels shape: torch.Size([1, 14308]) Final batch size: 1, sequence length: 36647 Attention mask shape: torch.Size([1, 1, 36647, 36647]) Position ids shape: torch.Size([1, 36647]) Input IDs shape: torch.Size([1, 36647]) Labels shape: torch.Size([1, 36647]) {'loss': 0.2958, 'grad_norm': 0.42782162373110677, 'learning_rate': 9.045084971874738e-06, 'num_tokens': -inf, 'epoch': 2.12} Final batch size: 1, sequence length: 4858 Attention mask shape: torch.Size([1, 1, 4858, 4858]) Position ids shape: torch.Size([1, 4858]) Input IDs shape: torch.Size([1, 4858]) Labels shape: torch.Size([1, 4858]) Final batch size: 1, sequence length: 6316 Attention mask shape: torch.Size([1, 1, 6316, 6316]) Position ids shape: torch.Size([1, 6316]) Input IDs shape: torch.Size([1, 6316]) Labels shape: torch.Size([1, 6316]) Final batch size: 1, sequence length: 7360 Attention mask shape: torch.Size([1, 1, 7360, 7360]) Position ids shape: torch.Size([1, 7360]) Input IDs shape: torch.Size([1, 7360]) Labels shape: torch.Size([1, 7360]) Final batch size: 1, sequence length: 11448 Attention mask shape: torch.Size([1, 1, 11448, 11448]) Position ids shape: torch.Size([1, 11448]) Input IDs shape: torch.Size([1, 11448]) Labels shape: torch.Size([1, 11448]) Final batch size: 1, sequence length: 12846 Attention mask shape: torch.Size([1, 1, 12846, 12846]) Position ids shape: torch.Size([1, 12846]) Input IDs shape: torch.Size([1, 12846]) Labels shape: torch.Size([1, 12846]) Final batch size: 1, sequence length: 12075 Attention mask shape: torch.Size([1, 1, 12075, 12075]) Position ids shape: torch.Size([1, 12075]) Input IDs shape: torch.Size([1, 12075]) Labels shape: torch.Size([1, 12075]) Final batch size: 1, sequence length: 12945 Attention mask shape: torch.Size([1, 1, 12945, 12945]) Position ids shape: torch.Size([1, 12945]) Input IDs shape: torch.Size([1, 12945]) Labels shape: torch.Size([1, 12945]) Final batch size: 1, sequence length: 15189 Attention mask shape: torch.Size([1, 1, 15189, 15189]) Position ids shape: torch.Size([1, 15189]) Input IDs shape: torch.Size([1, 15189]) Labels shape: torch.Size([1, 15189]) Final batch size: 1, sequence length: 14330 Attention mask shape: torch.Size([1, 1, 14330, 14330]) Position ids shape: torch.Size([1, 14330]) Input IDs shape: torch.Size([1, 14330]) Labels shape: torch.Size([1, 14330]) Final batch size: 1, sequence length: 16961 Attention mask shape: torch.Size([1, 1, 16961, 16961]) Position ids shape: torch.Size([1, 16961]) Input IDs shape: torch.Size([1, 16961]) Labels shape: torch.Size([1, 16961]) Final batch size: 1, sequence length: 17246 Attention mask shape: torch.Size([1, 1, 17246, 17246]) Position ids shape: torch.Size([1, 17246]) Input IDs shape: torch.Size([1, 17246]) Labels shape: torch.Size([1, 17246]) Final batch size: 1, sequence length: 16658 Attention mask shape: torch.Size([1, 1, 16658, 16658]) Position ids shape: torch.Size([1, 16658]) Input IDs shape: torch.Size([1, 16658]) Labels shape: torch.Size([1, 16658]) Final batch size: 1, sequence length: 14891 Attention mask shape: torch.Size([1, 1, 14891, 14891]) Position ids shape: torch.Size([1, 14891]) Input IDs shape: torch.Size([1, 14891]) Labels shape: torch.Size([1, 14891]) Final batch size: 1, sequence length: 16391 Attention mask shape: torch.Size([1, 1, 16391, 16391]) Position ids shape: torch.Size([1, 16391]) Input IDs shape: torch.Size([1, 16391]) Labels shape: torch.Size([1, 16391]) Final batch size: 1, sequence length: 18341 Attention mask shape: torch.Size([1, 1, 18341, 18341]) Position ids shape: torch.Size([1, 18341]) Input IDs shape: torch.Size([1, 18341]) Labels shape: torch.Size([1, 18341]) Final batch size: 1, sequence length: 18400 Attention mask shape: torch.Size([1, 1, 18400, 18400]) Position ids shape: torch.Size([1, 18400]) Input IDs shape: torch.Size([1, 18400]) Labels shape: torch.Size([1, 18400]) Final batch size: 1, sequence length: 18014 Attention mask shape: torch.Size([1, 1, 18014, 18014]) Position ids shape: torch.Size([1, 18014]) Input IDs shape: torch.Size([1, 18014]) Labels shape: torch.Size([1, 18014]) Final batch size: 1, sequence length: 18424 Attention mask shape: torch.Size([1, 1, 18424, 18424]) Position ids shape: torch.Size([1, 18424]) Input IDs shape: torch.Size([1, 18424]) Labels shape: torch.Size([1, 18424]) Final batch size: 1, sequence length: 19332 Attention mask shape: torch.Size([1, 1, 19332, 19332]) Position ids shape: torch.Size([1, 19332]) Input IDs shape: torch.Size([1, 19332]) Labels shape: torch.Size([1, 19332]) Final batch size: 1, sequence length: 16831 Attention mask shape: torch.Size([1, 1, 16831, 16831]) Position ids shape: torch.Size([1, 16831]) Input IDs shape: torch.Size([1, 16831]) Labels shape: torch.Size([1, 16831]) Final batch size: 1, sequence length: 16326 Attention mask shape: torch.Size([1, 1, 16326, 16326]) Position ids shape: torch.Size([1, 16326]) Input IDs shape: torch.Size([1, 16326]) Labels shape: torch.Size([1, 16326]) Final batch size: 1, sequence length: 19597 Attention mask shape: torch.Size([1, 1, 19597, 19597]) Position ids shape: torch.Size([1, 19597]) Input IDs shape: torch.Size([1, 19597]) Labels shape: torch.Size([1, 19597]) Final batch size: 1, sequence length: 16536 Attention mask shape: torch.Size([1, 1, 16536, 16536]) Position ids shape: torch.Size([1, 16536]) Input IDs shape: torch.Size([1, 16536]) Labels shape: torch.Size([1, 16536]) Final batch size: 1, sequence length: 21617 Attention mask shape: torch.Size([1, 1, 21617, 21617]) Position ids shape: torch.Size([1, 21617]) Input IDs shape: torch.Size([1, 21617]) Labels shape: torch.Size([1, 21617]) Final batch size: 1, sequence length: 20118 Attention mask shape: torch.Size([1, 1, 20118, 20118]) Position ids shape: torch.Size([1, 20118]) Input IDs shape: torch.Size([1, 20118]) Labels shape: torch.Size([1, 20118]) Final batch size: 1, sequence length: 21122 Attention mask shape: torch.Size([1, 1, 21122, 21122]) Position ids shape: torch.Size([1, 21122]) Input IDs shape: torch.Size([1, 21122]) Labels shape: torch.Size([1, 21122]) Final batch size: 1, sequence length: 18393 Attention mask shape: torch.Size([1, 1, 18393, 18393]) Position ids shape: torch.Size([1, 18393]) Input IDs shape: torch.Size([1, 18393]) Labels shape: torch.Size([1, 18393]) Final batch size: 1, sequence length: 19999 Attention mask shape: torch.Size([1, 1, 19999, 19999]) Position ids shape: torch.Size([1, 19999]) Input IDs shape: torch.Size([1, 19999]) Labels shape: torch.Size([1, 19999]) Final batch size: 1, sequence length: 13638 Attention mask shape: torch.Size([1, 1, 13638, 13638]) Position ids shape: torch.Size([1, 13638]) Input IDs shape: torch.Size([1, 13638]) Labels shape: torch.Size([1, 13638]) Final batch size: 1, sequence length: 20888 Attention mask shape: torch.Size([1, 1, 20888, 20888]) Position ids shape: torch.Size([1, 20888]) Input IDs shape: torch.Size([1, 20888]) Labels shape: torch.Size([1, 20888]) Final batch size: 1, sequence length: 20912 Attention mask shape: torch.Size([1, 1, 20912, 20912]) Position ids shape: torch.Size([1, 20912]) Input IDs shape: torch.Size([1, 20912]) Labels shape: torch.Size([1, 20912]) Final batch size: 1, sequence length: 22854 Attention mask shape: torch.Size([1, 1, 22854, 22854]) Position ids shape: torch.Size([1, 22854]) Input IDs shape: torch.Size([1, 22854]) Labels shape: torch.Size([1, 22854]) Final batch size: 1, sequence length: 21314 Attention mask shape: torch.Size([1, 1, 21314, 21314]) Position ids shape: torch.Size([1, 21314]) Input IDs shape: torch.Size([1, 21314]) Labels shape: torch.Size([1, 21314]) Final batch size: 1, sequence length: 24597 Attention mask shape: torch.Size([1, 1, 24597, 24597]) Position ids shape: torch.Size([1, 24597]) Input IDs shape: torch.Size([1, 24597]) Labels shape: torch.Size([1, 24597]) Final batch size: 1, sequence length: 21739 Attention mask shape: torch.Size([1, 1, 21739, 21739]) Position ids shape: torch.Size([1, 21739]) Input IDs shape: torch.Size([1, 21739]) Labels shape: torch.Size([1, 21739]) Final batch size: 1, sequence length: 25909 Attention mask shape: torch.Size([1, 1, 25909, 25909]) Position ids shape: torch.Size([1, 25909]) Input IDs shape: torch.Size([1, 25909]) Labels shape: torch.Size([1, 25909]) Final batch size: 1, sequence length: 6839 Attention mask shape: torch.Size([1, 1, 6839, 6839]) Position ids shape: torch.Size([1, 6839]) Input IDs shape: torch.Size([1, 6839]) Labels shape: torch.Size([1, 6839]) Final batch size: 1, sequence length: 9341 Attention mask shape: torch.Size([1, 1, 9341, 9341]) Position ids shape: torch.Size([1, 9341]) Input IDs shape: torch.Size([1, 9341]) Labels shape: torch.Size([1, 9341]) Final batch size: 1, sequence length: 12328 Attention mask shape: torch.Size([1, 1, 12328, 12328]) Position ids shape: torch.Size([1, 12328]) Input IDs shape: torch.Size([1, 12328]) Labels shape: torch.Size([1, 12328]) Final batch size: 1, sequence length: 25575 Attention mask shape: torch.Size([1, 1, 25575, 25575]) Position ids shape: torch.Size([1, 25575]) Input IDs shape: torch.Size([1, 25575]) Labels shape: torch.Size([1, 25575]) Final batch size: 1, sequence length: 21611 Attention mask shape: torch.Size([1, 1, 21611, 21611]) Position ids shape: torch.Size([1, 21611]) Input IDs shape: torch.Size([1, 21611]) Labels shape: torch.Size([1, 21611]) Final batch size: 1, sequence length: 21725 Attention mask shape: torch.Size([1, 1, 21725, 21725]) Position ids shape: torch.Size([1, 21725]) Input IDs shape: torch.Size([1, 21725]) Labels shape: torch.Size([1, 21725]) Final batch size: 1, sequence length: 25042 Attention mask shape: torch.Size([1, 1, 25042, 25042]) Position ids shape: torch.Size([1, 25042]) Input IDs shape: torch.Size([1, 25042]) Labels shape: torch.Size([1, 25042]) Final batch size: 1, sequence length: 24090 Attention mask shape: torch.Size([1, 1, 24090, 24090]) Position ids shape: torch.Size([1, 24090]) Input IDs shape: torch.Size([1, 24090]) Labels shape: torch.Size([1, 24090]) Final batch size: 1, sequence length: 19885 Attention mask shape: torch.Size([1, 1, 19885, 19885]) Position ids shape: torch.Size([1, 19885]) Input IDs shape: torch.Size([1, 19885]) Labels shape: torch.Size([1, 19885]) Final batch size: 1, sequence length: 21677 Attention mask shape: torch.Size([1, 1, 21677, 21677]) Position ids shape: torch.Size([1, 21677]) Input IDs shape: torch.Size([1, 21677]) Labels shape: torch.Size([1, 21677]) Final batch size: 1, sequence length: 28348 Attention mask shape: torch.Size([1, 1, 28348, 28348]) Position ids shape: torch.Size([1, 28348]) Input IDs shape: torch.Size([1, 28348]) Labels shape: torch.Size([1, 28348]) Final batch size: 1, sequence length: 24342 Attention mask shape: torch.Size([1, 1, 24342, 24342]) Position ids shape: torch.Size([1, 24342]) Input IDs shape: torch.Size([1, 24342]) Labels shape: torch.Size([1, 24342]) Final batch size: 1, sequence length: 15232 Attention mask shape: torch.Size([1, 1, 15232, 15232]) Position ids shape: torch.Size([1, 15232]) Input IDs shape: torch.Size([1, 15232]) Labels shape: torch.Size([1, 15232]) Final batch size: 1, sequence length: 20619 Attention mask shape: torch.Size([1, 1, 20619, 20619]) Position ids shape: torch.Size([1, 20619]) Input IDs shape: torch.Size([1, 20619]) Labels shape: torch.Size([1, 20619]) Final batch size: 1, sequence length: 15026 Attention mask shape: torch.Size([1, 1, 15026, 15026]) Position ids shape: torch.Size([1, 15026]) Input IDs shape: torch.Size([1, 15026]) Labels shape: torch.Size([1, 15026]) Final batch size: 1, sequence length: 27795 Attention mask shape: torch.Size([1, 1, 27795, 27795]) Position ids shape: torch.Size([1, 27795]) Input IDs shape: torch.Size([1, 27795]) Labels shape: torch.Size([1, 27795]) Final batch size: 1, sequence length: 16611 Attention mask shape: torch.Size([1, 1, 16611, 16611]) Position ids shape: torch.Size([1, 16611]) Input IDs shape: torch.Size([1, 16611]) Labels shape: torch.Size([1, 16611]) Final batch size: 1, sequence length: 30366 Attention mask shape: torch.Size([1, 1, 30366, 30366]) Position ids shape: torch.Size([1, 30366]) Input IDs shape: torch.Size([1, 30366]) Labels shape: torch.Size([1, 30366]) Final batch size: 1, sequence length: 26218 Attention mask shape: torch.Size([1, 1, 26218, 26218]) Position ids shape: torch.Size([1, 26218]) Input IDs shape: torch.Size([1, 26218]) Labels shape: torch.Size([1, 26218]) Final batch size: 1, sequence length: 17780 Attention mask shape: torch.Size([1, 1, 17780, 17780]) Position ids shape: torch.Size([1, 17780]) Input IDs shape: torch.Size([1, 17780]) Labels shape: torch.Size([1, 17780]) Final batch size: 1, sequence length: 13646 Attention mask shape: torch.Size([1, 1, 13646, 13646]) Position ids shape: torch.Size([1, 13646]) Input IDs shape: torch.Size([1, 13646]) Labels shape: torch.Size([1, 13646]) Final batch size: 1, sequence length: 23614 Attention mask shape: torch.Size([1, 1, 23614, 23614]) Position ids shape: torch.Size([1, 23614]) Input IDs shape: torch.Size([1, 23614]) Labels shape: torch.Size([1, 23614]) Final batch size: 1, sequence length: 28678 Attention mask shape: torch.Size([1, 1, 28678, 28678]) Position ids shape: torch.Size([1, 28678]) Input IDs shape: torch.Size([1, 28678]) Labels shape: torch.Size([1, 28678]) Final batch size: 1, sequence length: 20702 Attention mask shape: torch.Size([1, 1, 20702, 20702]) Position ids shape: torch.Size([1, 20702]) Input IDs shape: torch.Size([1, 20702]) Labels shape: torch.Size([1, 20702]) Final batch size: 1, sequence length: 21623 Attention mask shape: torch.Size([1, 1, 21623, 21623]) Position ids shape: torch.Size([1, 21623]) Input IDs shape: torch.Size([1, 21623]) Labels shape: torch.Size([1, 21623]) Final batch size: 1, sequence length: 22217 Attention mask shape: torch.Size([1, 1, 22217, 22217]) Position ids shape: torch.Size([1, 22217]) Input IDs shape: torch.Size([1, 22217]) Labels shape: torch.Size([1, 22217]) Final batch size: 1, sequence length: 30410 Attention mask shape: torch.Size([1, 1, 30410, 30410]) Position ids shape: torch.Size([1, 30410]) Input IDs shape: torch.Size([1, 30410]) Labels shape: torch.Size([1, 30410]) Final batch size: 1, sequence length: 29730 Attention mask shape: torch.Size([1, 1, 29730, 29730]) Position ids shape: torch.Size([1, 29730]) Input IDs shape: torch.Size([1, 29730]) Labels shape: torch.Size([1, 29730]) Final batch size: 1, sequence length: 30766 Attention mask shape: torch.Size([1, 1, 30766, 30766]) Position ids shape: torch.Size([1, 30766]) Input IDs shape: torch.Size([1, 30766]) Labels shape: torch.Size([1, 30766]) Final batch size: 1, sequence length: 32034 Attention mask shape: torch.Size([1, 1, 32034, 32034]) Position ids shape: torch.Size([1, 32034]) Input IDs shape: torch.Size([1, 32034]) Labels shape: torch.Size([1, 32034]) Final batch size: 1, sequence length: 20915 Attention mask shape: torch.Size([1, 1, 20915, 20915]) Position ids shape: torch.Size([1, 20915]) Input IDs shape: torch.Size([1, 20915]) Labels shape: torch.Size([1, 20915]) Final batch size: 1, sequence length: 30917 Attention mask shape: torch.Size([1, 1, 30917, 30917]) Position ids shape: torch.Size([1, 30917]) Input IDs shape: torch.Size([1, 30917]) Labels shape: torch.Size([1, 30917]) Final batch size: 1, sequence length: 16403 Attention mask shape: torch.Size([1, 1, 16403, 16403]) Position ids shape: torch.Size([1, 16403]) Input IDs shape: torch.Size([1, 16403]) Labels shape: torch.Size([1, 16403]) Final batch size: 1, sequence length: 30796 Attention mask shape: torch.Size([1, 1, 30796, 30796]) Position ids shape: torch.Size([1, 30796]) Input IDs shape: torch.Size([1, 30796]) Labels shape: torch.Size([1, 30796]) Final batch size: 1, sequence length: 25698 Attention mask shape: torch.Size([1, 1, 25698, 25698]) Position ids shape: torch.Size([1, 25698]) Input IDs shape: torch.Size([1, 25698]) Labels shape: torch.Size([1, 25698]) Final batch size: 1, sequence length: 32308 Attention mask shape: torch.Size([1, 1, 32308, 32308]) Position ids shape: torch.Size([1, 32308]) Input IDs shape: torch.Size([1, 32308]) Labels shape: torch.Size([1, 32308]) Final batch size: 1, sequence length: 20028 Attention mask shape: torch.Size([1, 1, 20028, 20028]) Position ids shape: torch.Size([1, 20028]) Input IDs shape: torch.Size([1, 20028]) Labels shape: torch.Size([1, 20028]) Final batch size: 1, sequence length: 35781 Attention mask shape: torch.Size([1, 1, 35781, 35781]) Position ids shape: torch.Size([1, 35781]) Input IDs shape: torch.Size([1, 35781]) Labels shape: torch.Size([1, 35781]) Final batch size: 1, sequence length: 35628 Attention mask shape: torch.Size([1, 1, 35628, 35628]) Position ids shape: torch.Size([1, 35628]) Input IDs shape: torch.Size([1, 35628]) Labels shape: torch.Size([1, 35628]) Final batch size: 1, sequence length: 14070 Attention mask shape: torch.Size([1, 1, 14070, 14070]) Position ids shape: torch.Size([1, 14070]) Input IDs shape: torch.Size([1, 14070]) Labels shape: torch.Size([1, 14070]) Final batch size: 1, sequence length: 21562 Attention mask shape: torch.Size([1, 1, 21562, 21562]) Position ids shape: torch.Size([1, 21562]) Input IDs shape: torch.Size([1, 21562]) Labels shape: torch.Size([1, 21562]) Final batch size: 1, sequence length: 32630 Attention mask shape: torch.Size([1, 1, 32630, 32630]) Position ids shape: torch.Size([1, 32630]) Input IDs shape: torch.Size([1, 32630]) Labels shape: torch.Size([1, 32630]) Final batch size: 1, sequence length: 38362 Attention mask shape: torch.Size([1, 1, 38362, 38362]) Position ids shape: torch.Size([1, 38362]) Input IDs shape: torch.Size([1, 38362]) Labels shape: torch.Size([1, 38362]) Final batch size: 1, sequence length: 32239 Attention mask shape: torch.Size([1, 1, 32239, 32239]) Position ids shape: torch.Size([1, 32239]) Input IDs shape: torch.Size([1, 32239]) Labels shape: torch.Size([1, 32239]) Final batch size: 1, sequence length: 15726 Attention mask shape: torch.Size([1, 1, 15726, 15726]) Position ids shape: torch.Size([1, 15726]) Input IDs shape: torch.Size([1, 15726]) Labels shape: torch.Size([1, 15726]) Final batch size: 1, sequence length: 37195 Attention mask shape: torch.Size([1, 1, 37195, 37195]) Position ids shape: torch.Size([1, 37195]) Input IDs shape: torch.Size([1, 37195]) Labels shape: torch.Size([1, 37195]) Final batch size: 1, sequence length: 35697 Attention mask shape: torch.Size([1, 1, 35697, 35697]) Position ids shape: torch.Size([1, 35697]) Input IDs shape: torch.Size([1, 35697]) Labels shape: torch.Size([1, 35697]) Final batch size: 1, sequence length: 34802 Attention mask shape: torch.Size([1, 1, 34802, 34802]) Position ids shape: torch.Size([1, 34802]) Input IDs shape: torch.Size([1, 34802]) Labels shape: torch.Size([1, 34802]) Final batch size: 1, sequence length: 29202 Attention mask shape: torch.Size([1, 1, 29202, 29202]) Position ids shape: torch.Size([1, 29202]) Input IDs shape: torch.Size([1, 29202]) Labels shape: torch.Size([1, 29202]) Final batch size: 1, sequence length: 38420 Attention mask shape: torch.Size([1, 1, 38420, 38420]) Position ids shape: torch.Size([1, 38420]) Input IDs shape: torch.Size([1, 38420]) Labels shape: torch.Size([1, 38420]) Final batch size: 1, sequence length: 39608 Attention mask shape: torch.Size([1, 1, 39608, 39608]) Position ids shape: torch.Size([1, 39608]) Input IDs shape: torch.Size([1, 39608]) Labels shape: torch.Size([1, 39608]) Final batch size: 1, sequence length: 23626 Attention mask shape: torch.Size([1, 1, 23626, 23626]) Position ids shape: torch.Size([1, 23626]) Input IDs shape: torch.Size([1, 23626]) Labels shape: torch.Size([1, 23626]) Final batch size: 1, sequence length: 11896 Attention mask shape: torch.Size([1, 1, 11896, 11896]) Position ids shape: torch.Size([1, 11896]) Input IDs shape: torch.Size([1, 11896]) Labels shape: torch.Size([1, 11896]) Final batch size: 1, sequence length: 38190 Attention mask shape: torch.Size([1, 1, 38190, 38190]) Position ids shape: torch.Size([1, 38190]) Input IDs shape: torch.Size([1, 38190]) Labels shape: torch.Size([1, 38190]) Final batch size: 1, sequence length: 22199 Attention mask shape: torch.Size([1, 1, 22199, 22199]) Position ids shape: torch.Size([1, 22199]) Input IDs shape: torch.Size([1, 22199]) Labels shape: torch.Size([1, 22199]) Final batch size: 1, sequence length: 14828 Attention mask shape: torch.Size([1, 1, 14828, 14828]) Position ids shape: torch.Size([1, 14828]) Input IDs shape: torch.Size([1, 14828]) Labels shape: torch.Size([1, 14828]) Final batch size: 1, sequence length: 22768 Attention mask shape: torch.Size([1, 1, 22768, 22768]) Position ids shape: torch.Size([1, 22768]) Input IDs shape: torch.Size([1, 22768]) Labels shape: torch.Size([1, 22768]) Final batch size: 1, sequence length: 26287 Attention mask shape: torch.Size([1, 1, 26287, 26287]) Position ids shape: torch.Size([1, 26287]) Input IDs shape: torch.Size([1, 26287]) Labels shape: torch.Size([1, 26287]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 18568 Attention mask shape: torch.Size([1, 1, 18568, 18568]) Position ids shape: torch.Size([1, 18568]) Input IDs shape: torch.Size([1, 18568]) Labels shape: torch.Size([1, 18568]) Final batch size: 1, sequence length: 26840 Attention mask shape: torch.Size([1, 1, 26840, 26840]) Position ids shape: torch.Size([1, 26840]) Input IDs shape: torch.Size([1, 26840]) Labels shape: torch.Size([1, 26840]) Final batch size: 1, sequence length: 10426 Attention mask shape: torch.Size([1, 1, 10426, 10426]) Position ids shape: torch.Size([1, 10426]) Input IDs shape: torch.Size([1, 10426]) Labels shape: torch.Size([1, 10426]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 31143 Attention mask shape: torch.Size([1, 1, 31143, 31143]) Position ids shape: torch.Size([1, 31143]) Input IDs shape: torch.Size([1, 31143]) Labels shape: torch.Size([1, 31143]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 30651 Attention mask shape: torch.Size([1, 1, 30651, 30651]) Position ids shape: torch.Size([1, 30651]) Input IDs shape: torch.Size([1, 30651]) Labels shape: torch.Size([1, 30651]) Final batch size: 1, sequence length: 16304 Attention mask shape: torch.Size([1, 1, 16304, 16304]) Position ids shape: torch.Size([1, 16304]) Input IDs shape: torch.Size([1, 16304]) Labels shape: torch.Size([1, 16304]) Final batch size: 1, sequence length: 35914 Attention mask shape: torch.Size([1, 1, 35914, 35914]) Position ids shape: torch.Size([1, 35914]) Input IDs shape: torch.Size([1, 35914]) Labels shape: torch.Size([1, 35914]) Final batch size: 1, sequence length: 40088 Attention mask shape: torch.Size([1, 1, 40088, 40088]) Position ids shape: torch.Size([1, 40088]) Input IDs shape: torch.Size([1, 40088]) Labels shape: torch.Size([1, 40088]) Final batch size: 1, sequence length: 37529 Attention mask shape: torch.Size([1, 1, 37529, 37529]) Position ids shape: torch.Size([1, 37529]) Input IDs shape: torch.Size([1, 37529]) Labels shape: torch.Size([1, 37529]) Final batch size: 1, sequence length: 34785 Attention mask shape: torch.Size([1, 1, 34785, 34785]) Position ids shape: torch.Size([1, 34785]) Input IDs shape: torch.Size([1, 34785]) Labels shape: torch.Size([1, 34785]) Final batch size: 1, sequence length: 27819 Attention mask shape: torch.Size([1, 1, 27819, 27819]) Position ids shape: torch.Size([1, 27819]) Input IDs shape: torch.Size([1, 27819]) Labels shape: torch.Size([1, 27819]) Final batch size: 1, sequence length: 32559 Attention mask shape: torch.Size([1, 1, 32559, 32559]) Position ids shape: torch.Size([1, 32559]) Input IDs shape: torch.Size([1, 32559]) Labels shape: torch.Size([1, 32559]) Final batch size: 1, sequence length: 18859 Attention mask shape: torch.Size([1, 1, 18859, 18859]) Position ids shape: torch.Size([1, 18859]) Input IDs shape: torch.Size([1, 18859]) Labels shape: torch.Size([1, 18859]) Final batch size: 1, sequence length: 10198 Attention mask shape: torch.Size([1, 1, 10198, 10198]) Position ids shape: torch.Size([1, 10198]) Input IDs shape: torch.Size([1, 10198]) Labels shape: torch.Size([1, 10198]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 36638 Attention mask shape: torch.Size([1, 1, 36638, 36638]) Position ids shape: torch.Size([1, 36638]) Input IDs shape: torch.Size([1, 36638]) Labels shape: torch.Size([1, 36638]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 19494 Attention mask shape: torch.Size([1, 1, 19494, 19494]) Position ids shape: torch.Size([1, 19494]) Input IDs shape: torch.Size([1, 19494]) Labels shape: torch.Size([1, 19494]) Final batch size: 1, sequence length: 31754 Attention mask shape: torch.Size([1, 1, 31754, 31754]) Position ids shape: torch.Size([1, 31754]) Input IDs shape: torch.Size([1, 31754]) Labels shape: torch.Size([1, 31754]) Final batch size: 1, sequence length: 40763 Attention mask shape: torch.Size([1, 1, 40763, 40763]) Position ids shape: torch.Size([1, 40763]) Input IDs shape: torch.Size([1, 40763]) Labels shape: torch.Size([1, 40763]) Final batch size: 1, sequence length: 30835 Attention mask shape: torch.Size([1, 1, 30835, 30835]) Position ids shape: torch.Size([1, 30835]) Input IDs shape: torch.Size([1, 30835]) Labels shape: torch.Size([1, 30835]) Final batch size: 1, sequence length: 31316 Attention mask shape: torch.Size([1, 1, 31316, 31316]) Position ids shape: torch.Size([1, 31316]) Input IDs shape: torch.Size([1, 31316]) Labels shape: torch.Size([1, 31316]) Final batch size: 1, sequence length: 23936 Attention mask shape: torch.Size([1, 1, 23936, 23936]) Position ids shape: torch.Size([1, 23936]) Input IDs shape: torch.Size([1, 23936]) Labels shape: torch.Size([1, 23936]) Final batch size: 1, sequence length: 12605 Attention mask shape: torch.Size([1, 1, 12605, 12605]) Position ids shape: torch.Size([1, 12605]) Input IDs shape: torch.Size([1, 12605]) Labels shape: torch.Size([1, 12605]) Final batch size: 1, sequence length: 26269 Attention mask shape: torch.Size([1, 1, 26269, 26269]) Position ids shape: torch.Size([1, 26269]) Input IDs shape: torch.Size([1, 26269]) Labels shape: torch.Size([1, 26269]) Final batch size: 1, sequence length: 25573 Attention mask shape: torch.Size([1, 1, 25573, 25573]) Position ids shape: torch.Size([1, 25573]) Input IDs shape: torch.Size([1, 25573]) Labels shape: torch.Size([1, 25573]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 37170 Attention mask shape: torch.Size([1, 1, 37170, 37170]) Position ids shape: torch.Size([1, 37170]) Input IDs shape: torch.Size([1, 37170]) Labels shape: torch.Size([1, 37170]) {'loss': 0.318, 'grad_norm': 0.43370865129632213, 'learning_rate': 8.885729807284855e-06, 'num_tokens': -inf, 'epoch': 2.25} Final batch size: 1, sequence length: 7998 Attention mask shape: torch.Size([1, 1, 7998, 7998]) Position ids shape: torch.Size([1, 7998]) Input IDs shape: torch.Size([1, 7998]) Labels shape: torch.Size([1, 7998]) Final batch size: 1, sequence length: 6925 Attention mask shape: torch.Size([1, 1, 6925, 6925]) Position ids shape: torch.Size([1, 6925]) Input IDs shape: torch.Size([1, 6925]) Labels shape: torch.Size([1, 6925]) Final batch size: 1, sequence length: 7402 Attention mask shape: torch.Size([1, 1, 7402, 7402]) Position ids shape: torch.Size([1, 7402]) Input IDs shape: torch.Size([1, 7402]) Labels shape: torch.Size([1, 7402]) Final batch size: 1, sequence length: 10102 Attention mask shape: torch.Size([1, 1, 10102, 10102]) Position ids shape: torch.Size([1, 10102]) Input IDs shape: torch.Size([1, 10102]) Labels shape: torch.Size([1, 10102]) Final batch size: 1, sequence length: 9243 Attention mask shape: torch.Size([1, 1, 9243, 9243]) Position ids shape: torch.Size([1, 9243]) Input IDs shape: torch.Size([1, 9243]) Labels shape: torch.Size([1, 9243]) Final batch size: 1, sequence length: 10804 Attention mask shape: torch.Size([1, 1, 10804, 10804]) Position ids shape: torch.Size([1, 10804]) Input IDs shape: torch.Size([1, 10804]) Labels shape: torch.Size([1, 10804]) Final batch size: 1, sequence length: 10862 Attention mask shape: torch.Size([1, 1, 10862, 10862]) Position ids shape: torch.Size([1, 10862]) Input IDs shape: torch.Size([1, 10862]) Labels shape: torch.Size([1, 10862]) Final batch size: 1, sequence length: 11947 Attention mask shape: torch.Size([1, 1, 11947, 11947]) Position ids shape: torch.Size([1, 11947]) Input IDs shape: torch.Size([1, 11947]) Labels shape: torch.Size([1, 11947]) Final batch size: 1, sequence length: 8432 Attention mask shape: torch.Size([1, 1, 8432, 8432]) Position ids shape: torch.Size([1, 8432]) Input IDs shape: torch.Size([1, 8432]) Labels shape: torch.Size([1, 8432]) Final batch size: 1, sequence length: 9452 Attention mask shape: torch.Size([1, 1, 9452, 9452]) Position ids shape: torch.Size([1, 9452]) Input IDs shape: torch.Size([1, 9452]) Labels shape: torch.Size([1, 9452]) Final batch size: 1, sequence length: 11623 Attention mask shape: torch.Size([1, 1, 11623, 11623]) Position ids shape: torch.Size([1, 11623]) Input IDs shape: torch.Size([1, 11623]) Labels shape: torch.Size([1, 11623]) Final batch size: 1, sequence length: 12719 Attention mask shape: torch.Size([1, 1, 12719, 12719]) Position ids shape: torch.Size([1, 12719]) Input IDs shape: torch.Size([1, 12719]) Labels shape: torch.Size([1, 12719]) Final batch size: 1, sequence length: 13209 Attention mask shape: torch.Size([1, 1, 13209, 13209]) Position ids shape: torch.Size([1, 13209]) Input IDs shape: torch.Size([1, 13209]) Labels shape: torch.Size([1, 13209]) Final batch size: 1, sequence length: 16060 Attention mask shape: torch.Size([1, 1, 16060, 16060]) Position ids shape: torch.Size([1, 16060]) Input IDs shape: torch.Size([1, 16060]) Labels shape: torch.Size([1, 16060]) Final batch size: 1, sequence length: 17370 Attention mask shape: torch.Size([1, 1, 17370, 17370]) Position ids shape: torch.Size([1, 17370]) Input IDs shape: torch.Size([1, 17370]) Labels shape: torch.Size([1, 17370]) Final batch size: 1, sequence length: 15053 Attention mask shape: torch.Size([1, 1, 15053, 15053]) Position ids shape: torch.Size([1, 15053]) Input IDs shape: torch.Size([1, 15053]) Labels shape: torch.Size([1, 15053]) Final batch size: 1, sequence length: 17376 Attention mask shape: torch.Size([1, 1, 17376, 17376]) Position ids shape: torch.Size([1, 17376]) Input IDs shape: torch.Size([1, 17376]) Labels shape: torch.Size([1, 17376]) Final batch size: 1, sequence length: 17918 Attention mask shape: torch.Size([1, 1, 17918, 17918]) Position ids shape: torch.Size([1, 17918]) Input IDs shape: torch.Size([1, 17918]) Labels shape: torch.Size([1, 17918]) Final batch size: 1, sequence length: 16536 Attention mask shape: torch.Size([1, 1, 16536, 16536]) Position ids shape: torch.Size([1, 16536]) Input IDs shape: torch.Size([1, 16536]) Labels shape: torch.Size([1, 16536]) Final batch size: 1, sequence length: 15886 Attention mask shape: torch.Size([1, 1, 15886, 15886]) Position ids shape: torch.Size([1, 15886]) Input IDs shape: torch.Size([1, 15886]) Labels shape: torch.Size([1, 15886]) Final batch size: 1, sequence length: 17665 Attention mask shape: torch.Size([1, 1, 17665, 17665]) Position ids shape: torch.Size([1, 17665]) Input IDs shape: torch.Size([1, 17665]) Labels shape: torch.Size([1, 17665]) Final batch size: 1, sequence length: 15794 Attention mask shape: torch.Size([1, 1, 15794, 15794]) Position ids shape: torch.Size([1, 15794]) Input IDs shape: torch.Size([1, 15794]) Labels shape: torch.Size([1, 15794]) Final batch size: 1, sequence length: 20185 Attention mask shape: torch.Size([1, 1, 20185, 20185]) Position ids shape: torch.Size([1, 20185]) Input IDs shape: torch.Size([1, 20185]) Labels shape: torch.Size([1, 20185]) Final batch size: 1, sequence length: 17767 Attention mask shape: torch.Size([1, 1, 17767, 17767]) Position ids shape: torch.Size([1, 17767]) Input IDs shape: torch.Size([1, 17767]) Labels shape: torch.Size([1, 17767]) Final batch size: 1, sequence length: 20056 Attention mask shape: torch.Size([1, 1, 20056, 20056]) Position ids shape: torch.Size([1, 20056]) Input IDs shape: torch.Size([1, 20056]) Labels shape: torch.Size([1, 20056]) Final batch size: 1, sequence length: 21117 Attention mask shape: torch.Size([1, 1, 21117, 21117]) Position ids shape: torch.Size([1, 21117]) Input IDs shape: torch.Size([1, 21117]) Labels shape: torch.Size([1, 21117]) Final batch size: 1, sequence length: 17625 Attention mask shape: torch.Size([1, 1, 17625, 17625]) Position ids shape: torch.Size([1, 17625]) Input IDs shape: torch.Size([1, 17625]) Labels shape: torch.Size([1, 17625]) Final batch size: 1, sequence length: 19629 Attention mask shape: torch.Size([1, 1, 19629, 19629]) Position ids shape: torch.Size([1, 19629]) Input IDs shape: torch.Size([1, 19629]) Labels shape: torch.Size([1, 19629]) Final batch size: 1, sequence length: 19609 Attention mask shape: torch.Size([1, 1, 19609, 19609]) Position ids shape: torch.Size([1, 19609]) Input IDs shape: torch.Size([1, 19609]) Labels shape: torch.Size([1, 19609]) Final batch size: 1, sequence length: 21771 Attention mask shape: torch.Size([1, 1, 21771, 21771]) Position ids shape: torch.Size([1, 21771]) Input IDs shape: torch.Size([1, 21771]) Labels shape: torch.Size([1, 21771]) Final batch size: 1, sequence length: 19935 Attention mask shape: torch.Size([1, 1, 19935, 19935]) Position ids shape: torch.Size([1, 19935]) Input IDs shape: torch.Size([1, 19935]) Labels shape: torch.Size([1, 19935]) Final batch size: 1, sequence length: 21980 Attention mask shape: torch.Size([1, 1, 21980, 21980]) Position ids shape: torch.Size([1, 21980]) Input IDs shape: torch.Size([1, 21980]) Labels shape: torch.Size([1, 21980]) Final batch size: 1, sequence length: 12006 Attention mask shape: torch.Size([1, 1, 12006, 12006]) Position ids shape: torch.Size([1, 12006]) Input IDs shape: torch.Size([1, 12006]) Labels shape: torch.Size([1, 12006]) Final batch size: 1, sequence length: 21421 Attention mask shape: torch.Size([1, 1, 21421, 21421]) Position ids shape: torch.Size([1, 21421]) Input IDs shape: torch.Size([1, 21421]) Labels shape: torch.Size([1, 21421]) Final batch size: 1, sequence length: 8498 Attention mask shape: torch.Size([1, 1, 8498, 8498]) Position ids shape: torch.Size([1, 8498]) Input IDs shape: torch.Size([1, 8498]) Labels shape: torch.Size([1, 8498]) Final batch size: 1, sequence length: 22208 Attention mask shape: torch.Size([1, 1, 22208, 22208]) Position ids shape: torch.Size([1, 22208]) Input IDs shape: torch.Size([1, 22208]) Labels shape: torch.Size([1, 22208]) Final batch size: 1, sequence length: 20554 Attention mask shape: torch.Size([1, 1, 20554, 20554]) Position ids shape: torch.Size([1, 20554]) Input IDs shape: torch.Size([1, 20554]) Labels shape: torch.Size([1, 20554]) Final batch size: 1, sequence length: 16141 Attention mask shape: torch.Size([1, 1, 16141, 16141]) Position ids shape: torch.Size([1, 16141]) Input IDs shape: torch.Size([1, 16141]) Labels shape: torch.Size([1, 16141]) Final batch size: 1, sequence length: 22683 Attention mask shape: torch.Size([1, 1, 22683, 22683]) Position ids shape: torch.Size([1, 22683]) Input IDs shape: torch.Size([1, 22683]) Labels shape: torch.Size([1, 22683]) Final batch size: 1, sequence length: 20524 Attention mask shape: torch.Size([1, 1, 20524, 20524]) Position ids shape: torch.Size([1, 20524]) Input IDs shape: torch.Size([1, 20524]) Labels shape: torch.Size([1, 20524]) Final batch size: 1, sequence length: 22797 Attention mask shape: torch.Size([1, 1, 22797, 22797]) Position ids shape: torch.Size([1, 22797]) Input IDs shape: torch.Size([1, 22797]) Labels shape: torch.Size([1, 22797]) Final batch size: 1, sequence length: 20816 Attention mask shape: torch.Size([1, 1, 20816, 20816]) Position ids shape: torch.Size([1, 20816]) Input IDs shape: torch.Size([1, 20816]) Labels shape: torch.Size([1, 20816]) Final batch size: 1, sequence length: 24308 Attention mask shape: torch.Size([1, 1, 24308, 24308]) Position ids shape: torch.Size([1, 24308]) Input IDs shape: torch.Size([1, 24308]) Labels shape: torch.Size([1, 24308]) Final batch size: 1, sequence length: 27179 Attention mask shape: torch.Size([1, 1, 27179, 27179]) Position ids shape: torch.Size([1, 27179]) Input IDs shape: torch.Size([1, 27179]) Labels shape: torch.Size([1, 27179]) Final batch size: 1, sequence length: 24515 Attention mask shape: torch.Size([1, 1, 24515, 24515]) Position ids shape: torch.Size([1, 24515]) Input IDs shape: torch.Size([1, 24515]) Labels shape: torch.Size([1, 24515]) Final batch size: 1, sequence length: 26449 Attention mask shape: torch.Size([1, 1, 26449, 26449]) Position ids shape: torch.Size([1, 26449]) Input IDs shape: torch.Size([1, 26449]) Labels shape: torch.Size([1, 26449]) Final batch size: 1, sequence length: 16439 Attention mask shape: torch.Size([1, 1, 16439, 16439]) Position ids shape: torch.Size([1, 16439]) Input IDs shape: torch.Size([1, 16439]) Labels shape: torch.Size([1, 16439]) Final batch size: 1, sequence length: 18991 Attention mask shape: torch.Size([1, 1, 18991, 18991]) Position ids shape: torch.Size([1, 18991]) Input IDs shape: torch.Size([1, 18991]) Labels shape: torch.Size([1, 18991]) Final batch size: 1, sequence length: 26121 Attention mask shape: torch.Size([1, 1, 26121, 26121]) Position ids shape: torch.Size([1, 26121]) Input IDs shape: torch.Size([1, 26121]) Labels shape: torch.Size([1, 26121]) Final batch size: 1, sequence length: 10857 Attention mask shape: torch.Size([1, 1, 10857, 10857]) Position ids shape: torch.Size([1, 10857]) Input IDs shape: torch.Size([1, 10857]) Labels shape: torch.Size([1, 10857]) Final batch size: 1, sequence length: 26142 Attention mask shape: torch.Size([1, 1, 26142, 26142]) Position ids shape: torch.Size([1, 26142]) Input IDs shape: torch.Size([1, 26142]) Labels shape: torch.Size([1, 26142]) Final batch size: 1, sequence length: 28164 Attention mask shape: torch.Size([1, 1, 28164, 28164]) Position ids shape: torch.Size([1, 28164]) Input IDs shape: torch.Size([1, 28164]) Labels shape: torch.Size([1, 28164]) Final batch size: 1, sequence length: 24433 Attention mask shape: torch.Size([1, 1, 24433, 24433]) Position ids shape: torch.Size([1, 24433]) Input IDs shape: torch.Size([1, 24433]) Labels shape: torch.Size([1, 24433]) Final batch size: 1, sequence length: 18122 Attention mask shape: torch.Size([1, 1, 18122, 18122]) Position ids shape: torch.Size([1, 18122]) Input IDs shape: torch.Size([1, 18122]) Labels shape: torch.Size([1, 18122]) Final batch size: 1, sequence length: 21152 Attention mask shape: torch.Size([1, 1, 21152, 21152]) Position ids shape: torch.Size([1, 21152]) Input IDs shape: torch.Size([1, 21152]) Labels shape: torch.Size([1, 21152]) Final batch size: 1, sequence length: 30197 Attention mask shape: torch.Size([1, 1, 30197, 30197]) Position ids shape: torch.Size([1, 30197]) Input IDs shape: torch.Size([1, 30197]) Labels shape: torch.Size([1, 30197]) Final batch size: 1, sequence length: 16564 Attention mask shape: torch.Size([1, 1, 16564, 16564]) Position ids shape: torch.Size([1, 16564]) Input IDs shape: torch.Size([1, 16564]) Labels shape: torch.Size([1, 16564]) Final batch size: 1, sequence length: 29628 Attention mask shape: torch.Size([1, 1, 29628, 29628]) Position ids shape: torch.Size([1, 29628]) Input IDs shape: torch.Size([1, 29628]) Labels shape: torch.Size([1, 29628]) Final batch size: 1, sequence length: 24802 Attention mask shape: torch.Size([1, 1, 24802, 24802]) Position ids shape: torch.Size([1, 24802]) Input IDs shape: torch.Size([1, 24802]) Labels shape: torch.Size([1, 24802]) Final batch size: 1, sequence length: 31507 Attention mask shape: torch.Size([1, 1, 31507, 31507]) Position ids shape: torch.Size([1, 31507]) Input IDs shape: torch.Size([1, 31507]) Labels shape: torch.Size([1, 31507]) Final batch size: 1, sequence length: 17363 Attention mask shape: torch.Size([1, 1, 17363, 17363]) Position ids shape: torch.Size([1, 17363]) Input IDs shape: torch.Size([1, 17363]) Labels shape: torch.Size([1, 17363]) Final batch size: 1, sequence length: 29392 Attention mask shape: torch.Size([1, 1, 29392, 29392]) Position ids shape: torch.Size([1, 29392]) Input IDs shape: torch.Size([1, 29392]) Labels shape: torch.Size([1, 29392]) Final batch size: 1, sequence length: 31339 Attention mask shape: torch.Size([1, 1, 31339, 31339]) Position ids shape: torch.Size([1, 31339]) Input IDs shape: torch.Size([1, 31339]) Labels shape: torch.Size([1, 31339]) Final batch size: 1, sequence length: 16837 Attention mask shape: torch.Size([1, 1, 16837, 16837]) Position ids shape: torch.Size([1, 16837]) Input IDs shape: torch.Size([1, 16837]) Labels shape: torch.Size([1, 16837]) Final batch size: 1, sequence length: 32752 Attention mask shape: torch.Size([1, 1, 32752, 32752]) Position ids shape: torch.Size([1, 32752]) Input IDs shape: torch.Size([1, 32752]) Labels shape: torch.Size([1, 32752]) Final batch size: 1, sequence length: 33894 Attention mask shape: torch.Size([1, 1, 33894, 33894]) Position ids shape: torch.Size([1, 33894]) Input IDs shape: torch.Size([1, 33894]) Labels shape: torch.Size([1, 33894]) Final batch size: 1, sequence length: 24527 Attention mask shape: torch.Size([1, 1, 24527, 24527]) Position ids shape: torch.Size([1, 24527]) Input IDs shape: torch.Size([1, 24527]) Labels shape: torch.Size([1, 24527]) Final batch size: 1, sequence length: 22762 Attention mask shape: torch.Size([1, 1, 22762, 22762]) Position ids shape: torch.Size([1, 22762]) Input IDs shape: torch.Size([1, 22762]) Labels shape: torch.Size([1, 22762]) Final batch size: 1, sequence length: 11107 Attention mask shape: torch.Size([1, 1, 11107, 11107]) Position ids shape: torch.Size([1, 11107]) Input IDs shape: torch.Size([1, 11107]) Labels shape: torch.Size([1, 11107]) Final batch size: 1, sequence length: 30181 Attention mask shape: torch.Size([1, 1, 30181, 30181]) Position ids shape: torch.Size([1, 30181]) Input IDs shape: torch.Size([1, 30181]) Labels shape: torch.Size([1, 30181]) Final batch size: 1, sequence length: 27061 Attention mask shape: torch.Size([1, 1, 27061, 27061]) Position ids shape: torch.Size([1, 27061]) Input IDs shape: torch.Size([1, 27061]) Labels shape: torch.Size([1, 27061]) Final batch size: 1, sequence length: 31525 Attention mask shape: torch.Size([1, 1, 31525, 31525]) Position ids shape: torch.Size([1, 31525]) Input IDs shape: torch.Size([1, 31525]) Labels shape: torch.Size([1, 31525]) Final batch size: 1, sequence length: 32466 Attention mask shape: torch.Size([1, 1, 32466, 32466]) Position ids shape: torch.Size([1, 32466]) Input IDs shape: torch.Size([1, 32466]) Labels shape: torch.Size([1, 32466]) Final batch size: 1, sequence length: 33794 Attention mask shape: torch.Size([1, 1, 33794, 33794]) Position ids shape: torch.Size([1, 33794]) Input IDs shape: torch.Size([1, 33794]) Labels shape: torch.Size([1, 33794]) Final batch size: 1, sequence length: 21143 Attention mask shape: torch.Size([1, 1, 21143, 21143]) Position ids shape: torch.Size([1, 21143]) Input IDs shape: torch.Size([1, 21143]) Labels shape: torch.Size([1, 21143]) Final batch size: 1, sequence length: 34711 Attention mask shape: torch.Size([1, 1, 34711, 34711]) Position ids shape: torch.Size([1, 34711]) Input IDs shape: torch.Size([1, 34711]) Labels shape: torch.Size([1, 34711]) Final batch size: 1, sequence length: 34405 Attention mask shape: torch.Size([1, 1, 34405, 34405]) Position ids shape: torch.Size([1, 34405]) Input IDs shape: torch.Size([1, 34405]) Labels shape: torch.Size([1, 34405]) Final batch size: 1, sequence length: 37343 Attention mask shape: torch.Size([1, 1, 37343, 37343]) Position ids shape: torch.Size([1, 37343]) Input IDs shape: torch.Size([1, 37343]) Labels shape: torch.Size([1, 37343]) Final batch size: 1, sequence length: 34512 Attention mask shape: torch.Size([1, 1, 34512, 34512]) Position ids shape: torch.Size([1, 34512]) Input IDs shape: torch.Size([1, 34512]) Labels shape: torch.Size([1, 34512]) Final batch size: 1, sequence length: 20274 Attention mask shape: torch.Size([1, 1, 20274, 20274]) Position ids shape: torch.Size([1, 20274]) Input IDs shape: torch.Size([1, 20274]) Labels shape: torch.Size([1, 20274]) Final batch size: 1, sequence length: 13942 Attention mask shape: torch.Size([1, 1, 13942, 13942]) Position ids shape: torch.Size([1, 13942]) Input IDs shape: torch.Size([1, 13942]) Labels shape: torch.Size([1, 13942]) Final batch size: 1, sequence length: 17181 Attention mask shape: torch.Size([1, 1, 17181, 17181]) Position ids shape: torch.Size([1, 17181]) Input IDs shape: torch.Size([1, 17181]) Labels shape: torch.Size([1, 17181]) Final batch size: 1, sequence length: 33087 Attention mask shape: torch.Size([1, 1, 33087, 33087]) Position ids shape: torch.Size([1, 33087]) Input IDs shape: torch.Size([1, 33087]) Labels shape: torch.Size([1, 33087]) Final batch size: 1, sequence length: 31561 Attention mask shape: torch.Size([1, 1, 31561, 31561]) Position ids shape: torch.Size([1, 31561]) Input IDs shape: torch.Size([1, 31561]) Labels shape: torch.Size([1, 31561]) Final batch size: 1, sequence length: 38010 Attention mask shape: torch.Size([1, 1, 38010, 38010]) Position ids shape: torch.Size([1, 38010]) Input IDs shape: torch.Size([1, 38010]) Labels shape: torch.Size([1, 38010]) Final batch size: 1, sequence length: 31875 Attention mask shape: torch.Size([1, 1, 31875, 31875]) Position ids shape: torch.Size([1, 31875]) Input IDs shape: torch.Size([1, 31875]) Labels shape: torch.Size([1, 31875]) Final batch size: 1, sequence length: 16716 Attention mask shape: torch.Size([1, 1, 16716, 16716]) Position ids shape: torch.Size([1, 16716]) Input IDs shape: torch.Size([1, 16716]) Labels shape: torch.Size([1, 16716]) Final batch size: 1, sequence length: 32891 Attention mask shape: torch.Size([1, 1, 32891, 32891]) Position ids shape: torch.Size([1, 32891]) Input IDs shape: torch.Size([1, 32891]) Labels shape: torch.Size([1, 32891]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 37555 Attention mask shape: torch.Size([1, 1, 37555, 37555]) Position ids shape: torch.Size([1, 37555]) Input IDs shape: torch.Size([1, 37555]) Labels shape: torch.Size([1, 37555]) Final batch size: 1, sequence length: 29216 Attention mask shape: torch.Size([1, 1, 29216, 29216]) Position ids shape: torch.Size([1, 29216]) Input IDs shape: torch.Size([1, 29216]) Labels shape: torch.Size([1, 29216]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 31518 Attention mask shape: torch.Size([1, 1, 31518, 31518]) Position ids shape: torch.Size([1, 31518]) Input IDs shape: torch.Size([1, 31518]) Labels shape: torch.Size([1, 31518]) Final batch size: 1, sequence length: 35523 Attention mask shape: torch.Size([1, 1, 35523, 35523]) Position ids shape: torch.Size([1, 35523]) Input IDs shape: torch.Size([1, 35523]) Labels shape: torch.Size([1, 35523]) Final batch size: 1, sequence length: 33367 Attention mask shape: torch.Size([1, 1, 33367, 33367]) Position ids shape: torch.Size([1, 33367]) Input IDs shape: torch.Size([1, 33367]) Labels shape: torch.Size([1, 33367]) Final batch size: 1, sequence length: 40910 Attention mask shape: torch.Size([1, 1, 40910, 40910]) Position ids shape: torch.Size([1, 40910]) Input IDs shape: torch.Size([1, 40910]) Labels shape: torch.Size([1, 40910]) Final batch size: 1, sequence length: 18229 Attention mask shape: torch.Size([1, 1, 18229, 18229]) Position ids shape: torch.Size([1, 18229]) Input IDs shape: torch.Size([1, 18229]) Labels shape: torch.Size([1, 18229]) Final batch size: 1, sequence length: 16433 Attention mask shape: torch.Size([1, 1, 16433, 16433]) Position ids shape: torch.Size([1, 16433]) Input IDs shape: torch.Size([1, 16433]) Labels shape: torch.Size([1, 16433]) Final batch size: 1, sequence length: 13978 Attention mask shape: torch.Size([1, 1, 13978, 13978]) Position ids shape: torch.Size([1, 13978]) Input IDs shape: torch.Size([1, 13978]) Labels shape: torch.Size([1, 13978]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 36450 Attention mask shape: torch.Size([1, 1, 36450, 36450]) Position ids shape: torch.Size([1, 36450]) Input IDs shape: torch.Size([1, 36450]) Labels shape: torch.Size([1, 36450]) Final batch size: 1, sequence length: 36464 Attention mask shape: torch.Size([1, 1, 36464, 36464]) Position ids shape: torch.Size([1, 36464]) Input IDs shape: torch.Size([1, 36464]) Labels shape: torch.Size([1, 36464]) Final batch size: 1, sequence length: 37349 Attention mask shape: torch.Size([1, 1, 37349, 37349]) Position ids shape: torch.Size([1, 37349]) Input IDs shape: torch.Size([1, 37349]) Labels shape: torch.Size([1, 37349]) Final batch size: 1, sequence length: 21859 Attention mask shape: torch.Size([1, 1, 21859, 21859]) Position ids shape: torch.Size([1, 21859]) Input IDs shape: torch.Size([1, 21859]) Labels shape: torch.Size([1, 21859]) Final batch size: 1, sequence length: 29042 Attention mask shape: torch.Size([1, 1, 29042, 29042]) Position ids shape: torch.Size([1, 29042]) Input IDs shape: torch.Size([1, 29042]) Labels shape: torch.Size([1, 29042]) Final batch size: 1, sequence length: 30134 Attention mask shape: torch.Size([1, 1, 30134, 30134]) Position ids shape: torch.Size([1, 30134]) Input IDs shape: torch.Size([1, 30134]) Labels shape: torch.Size([1, 30134]) Final batch size: 1, sequence length: 40752 Attention mask shape: torch.Size([1, 1, 40752, 40752]) Position ids shape: torch.Size([1, 40752]) Input IDs shape: torch.Size([1, 40752]) Labels shape: torch.Size([1, 40752]) Final batch size: 1, sequence length: 31449 Attention mask shape: torch.Size([1, 1, 31449, 31449]) Position ids shape: torch.Size([1, 31449]) Input IDs shape: torch.Size([1, 31449]) Labels shape: torch.Size([1, 31449]) Final batch size: 1, sequence length: 26922 Attention mask shape: torch.Size([1, 1, 26922, 26922]) Position ids shape: torch.Size([1, 26922]) Input IDs shape: torch.Size([1, 26922]) Labels shape: torch.Size([1, 26922]) Final batch size: 1, sequence length: 17758 Attention mask shape: torch.Size([1, 1, 17758, 17758]) Position ids shape: torch.Size([1, 17758]) Input IDs shape: torch.Size([1, 17758]) Labels shape: torch.Size([1, 17758]) Final batch size: 1, sequence length: 19287 Attention mask shape: torch.Size([1, 1, 19287, 19287]) Position ids shape: torch.Size([1, 19287]) Input IDs shape: torch.Size([1, 19287]) Labels shape: torch.Size([1, 19287]) Final batch size: 1, sequence length: 17535 Attention mask shape: torch.Size([1, 1, 17535, 17535]) Position ids shape: torch.Size([1, 17535]) Input IDs shape: torch.Size([1, 17535]) Labels shape: torch.Size([1, 17535]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 39258 Attention mask shape: torch.Size([1, 1, 39258, 39258]) Position ids shape: torch.Size([1, 39258]) Input IDs shape: torch.Size([1, 39258]) Labels shape: torch.Size([1, 39258]) Final batch size: 1, sequence length: 39169 Attention mask shape: torch.Size([1, 1, 39169, 39169]) Position ids shape: torch.Size([1, 39169]) Input IDs shape: torch.Size([1, 39169]) Labels shape: torch.Size([1, 39169]) Final batch size: 1, sequence length: 38529 Attention mask shape: torch.Size([1, 1, 38529, 38529]) Position ids shape: torch.Size([1, 38529]) Input IDs shape: torch.Size([1, 38529]) Labels shape: torch.Size([1, 38529]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 26893 Attention mask shape: torch.Size([1, 1, 26893, 26893]) Position ids shape: torch.Size([1, 26893]) Input IDs shape: torch.Size([1, 26893]) Labels shape: torch.Size([1, 26893]) Final batch size: 1, sequence length: 17890 Attention mask shape: torch.Size([1, 1, 17890, 17890]) Position ids shape: torch.Size([1, 17890]) Input IDs shape: torch.Size([1, 17890]) Labels shape: torch.Size([1, 17890]) Final batch size: 1, sequence length: 36599 Attention mask shape: torch.Size([1, 1, 36599, 36599]) Position ids shape: torch.Size([1, 36599]) Input IDs shape: torch.Size([1, 36599]) Labels shape: torch.Size([1, 36599]) Final batch size: 1, sequence length: 38891 Attention mask shape: torch.Size([1, 1, 38891, 38891]) Position ids shape: torch.Size([1, 38891]) Input IDs shape: torch.Size([1, 38891]) Labels shape: torch.Size([1, 38891]) Final batch size: 1, sequence length: 32638 Attention mask shape: torch.Size([1, 1, 32638, 32638]) Position ids shape: torch.Size([1, 32638]) Input IDs shape: torch.Size([1, 32638]) Labels shape: torch.Size([1, 32638]) Final batch size: 1, sequence length: 32465 Attention mask shape: torch.Size([1, 1, 32465, 32465]) Position ids shape: torch.Size([1, 32465]) Input IDs shape: torch.Size([1, 32465]) Labels shape: torch.Size([1, 32465]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 18471 Attention mask shape: torch.Size([1, 1, 18471, 18471]) Position ids shape: torch.Size([1, 18471]) Input IDs shape: torch.Size([1, 18471]) Labels shape: torch.Size([1, 18471]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 39953 Attention mask shape: torch.Size([1, 1, 39953, 39953]) Position ids shape: torch.Size([1, 39953]) Input IDs shape: torch.Size([1, 39953]) Labels shape: torch.Size([1, 39953]) {'loss': 0.3042, 'grad_norm': 0.42428071654895794, 'learning_rate': 8.715724127386971e-06, 'num_tokens': -inf, 'epoch': 2.38} Final batch size: 1, sequence length: 5818 Attention mask shape: torch.Size([1, 1, 5818, 5818]) Position ids shape: torch.Size([1, 5818]) Input IDs shape: torch.Size([1, 5818]) Labels shape: torch.Size([1, 5818]) Final batch size: 1, sequence length: 7977 Attention mask shape: torch.Size([1, 1, 7977, 7977]) Position ids shape: torch.Size([1, 7977]) Input IDs shape: torch.Size([1, 7977]) Labels shape: torch.Size([1, 7977]) Final batch size: 1, sequence length: 6215 Attention mask shape: torch.Size([1, 1, 6215, 6215]) Position ids shape: torch.Size([1, 6215]) Input IDs shape: torch.Size([1, 6215]) Labels shape: torch.Size([1, 6215]) Final batch size: 1, sequence length: 6095 Attention mask shape: torch.Size([1, 1, 6095, 6095]) Position ids shape: torch.Size([1, 6095]) Input IDs shape: torch.Size([1, 6095]) Labels shape: torch.Size([1, 6095]) Final batch size: 1, sequence length: 8500 Attention mask shape: torch.Size([1, 1, 8500, 8500]) Position ids shape: torch.Size([1, 8500]) Input IDs shape: torch.Size([1, 8500]) Labels shape: torch.Size([1, 8500]) Final batch size: 1, sequence length: 10080 Attention mask shape: torch.Size([1, 1, 10080, 10080]) Position ids shape: torch.Size([1, 10080]) Input IDs shape: torch.Size([1, 10080]) Labels shape: torch.Size([1, 10080]) Final batch size: 1, sequence length: 12454 Attention mask shape: torch.Size([1, 1, 12454, 12454]) Position ids shape: torch.Size([1, 12454]) Input IDs shape: torch.Size([1, 12454]) Labels shape: torch.Size([1, 12454]) Final batch size: 1, sequence length: 10107 Attention mask shape: torch.Size([1, 1, 10107, 10107]) Position ids shape: torch.Size([1, 10107]) Input IDs shape: torch.Size([1, 10107]) Labels shape: torch.Size([1, 10107]) Final batch size: 1, sequence length: 12826 Attention mask shape: torch.Size([1, 1, 12826, 12826]) Position ids shape: torch.Size([1, 12826]) Input IDs shape: torch.Size([1, 12826]) Labels shape: torch.Size([1, 12826]) Final batch size: 1, sequence length: 10505 Attention mask shape: torch.Size([1, 1, 10505, 10505]) Position ids shape: torch.Size([1, 10505]) Input IDs shape: torch.Size([1, 10505]) Labels shape: torch.Size([1, 10505]) Final batch size: 1, sequence length: 13092 Attention mask shape: torch.Size([1, 1, 13092, 13092]) Position ids shape: torch.Size([1, 13092]) Input IDs shape: torch.Size([1, 13092]) Labels shape: torch.Size([1, 13092]) Final batch size: 1, sequence length: 13624 Attention mask shape: torch.Size([1, 1, 13624, 13624]) Position ids shape: torch.Size([1, 13624]) Input IDs shape: torch.Size([1, 13624]) Labels shape: torch.Size([1, 13624]) Final batch size: 1, sequence length: 9217 Attention mask shape: torch.Size([1, 1, 9217, 9217]) Position ids shape: torch.Size([1, 9217]) Input IDs shape: torch.Size([1, 9217]) Labels shape: torch.Size([1, 9217]) Final batch size: 1, sequence length: 10687 Attention mask shape: torch.Size([1, 1, 10687, 10687]) Position ids shape: torch.Size([1, 10687]) Input IDs shape: torch.Size([1, 10687]) Labels shape: torch.Size([1, 10687]) Final batch size: 1, sequence length: 12928 Attention mask shape: torch.Size([1, 1, 12928, 12928]) Position ids shape: torch.Size([1, 12928]) Input IDs shape: torch.Size([1, 12928]) Labels shape: torch.Size([1, 12928]) Final batch size: 1, sequence length: 9379 Attention mask shape: torch.Size([1, 1, 9379, 9379]) Position ids shape: torch.Size([1, 9379]) Input IDs shape: torch.Size([1, 9379]) Labels shape: torch.Size([1, 9379]) Final batch size: 1, sequence length: 13459 Attention mask shape: torch.Size([1, 1, 13459, 13459]) Position ids shape: torch.Size([1, 13459]) Input IDs shape: torch.Size([1, 13459]) Labels shape: torch.Size([1, 13459]) Final batch size: 1, sequence length: 16782 Attention mask shape: torch.Size([1, 1, 16782, 16782]) Position ids shape: torch.Size([1, 16782]) Input IDs shape: torch.Size([1, 16782]) Labels shape: torch.Size([1, 16782]) Final batch size: 1, sequence length: 10469 Attention mask shape: torch.Size([1, 1, 10469, 10469]) Position ids shape: torch.Size([1, 10469]) Input IDs shape: torch.Size([1, 10469]) Labels shape: torch.Size([1, 10469]) Final batch size: 1, sequence length: 15185 Attention mask shape: torch.Size([1, 1, 15185, 15185]) Position ids shape: torch.Size([1, 15185]) Input IDs shape: torch.Size([1, 15185]) Labels shape: torch.Size([1, 15185]) Final batch size: 1, sequence length: 12556 Attention mask shape: torch.Size([1, 1, 12556, 12556]) Position ids shape: torch.Size([1, 12556]) Input IDs shape: torch.Size([1, 12556]) Labels shape: torch.Size([1, 12556]) Final batch size: 1, sequence length: 10390 Attention mask shape: torch.Size([1, 1, 10390, 10390]) Position ids shape: torch.Size([1, 10390]) Input IDs shape: torch.Size([1, 10390]) Labels shape: torch.Size([1, 10390]) Final batch size: 1, sequence length: 17417 Attention mask shape: torch.Size([1, 1, 17417, 17417]) Position ids shape: torch.Size([1, 17417]) Input IDs shape: torch.Size([1, 17417]) Labels shape: torch.Size([1, 17417]) Final batch size: 1, sequence length: 18469 Attention mask shape: torch.Size([1, 1, 18469, 18469]) Position ids shape: torch.Size([1, 18469]) Input IDs shape: torch.Size([1, 18469]) Labels shape: torch.Size([1, 18469]) Final batch size: 1, sequence length: 19671 Attention mask shape: torch.Size([1, 1, 19671, 19671]) Position ids shape: torch.Size([1, 19671]) Input IDs shape: torch.Size([1, 19671]) Labels shape: torch.Size([1, 19671]) Final batch size: 1, sequence length: 19683 Attention mask shape: torch.Size([1, 1, 19683, 19683]) Position ids shape: torch.Size([1, 19683]) Input IDs shape: torch.Size([1, 19683]) Labels shape: torch.Size([1, 19683]) Final batch size: 1, sequence length: 17980 Attention mask shape: torch.Size([1, 1, 17980, 17980]) Position ids shape: torch.Size([1, 17980]) Input IDs shape: torch.Size([1, 17980]) Labels shape: torch.Size([1, 17980]) Final batch size: 1, sequence length: 10277 Attention mask shape: torch.Size([1, 1, 10277, 10277]) Position ids shape: torch.Size([1, 10277]) Input IDs shape: torch.Size([1, 10277]) Labels shape: torch.Size([1, 10277]) Final batch size: 1, sequence length: 19187 Attention mask shape: torch.Size([1, 1, 19187, 19187]) Position ids shape: torch.Size([1, 19187]) Input IDs shape: torch.Size([1, 19187]) Labels shape: torch.Size([1, 19187]) Final batch size: 1, sequence length: 18454 Attention mask shape: torch.Size([1, 1, 18454, 18454]) Position ids shape: torch.Size([1, 18454]) Input IDs shape: torch.Size([1, 18454]) Labels shape: torch.Size([1, 18454]) Final batch size: 1, sequence length: 20191 Attention mask shape: torch.Size([1, 1, 20191, 20191]) Position ids shape: torch.Size([1, 20191]) Input IDs shape: torch.Size([1, 20191]) Labels shape: torch.Size([1, 20191]) Final batch size: 1, sequence length: 21051 Attention mask shape: torch.Size([1, 1, 21051, 21051]) Position ids shape: torch.Size([1, 21051]) Input IDs shape: torch.Size([1, 21051]) Labels shape: torch.Size([1, 21051]) Final batch size: 1, sequence length: 15263 Attention mask shape: torch.Size([1, 1, 15263, 15263]) Position ids shape: torch.Size([1, 15263]) Input IDs shape: torch.Size([1, 15263]) Labels shape: torch.Size([1, 15263]) Final batch size: 1, sequence length: 20243 Attention mask shape: torch.Size([1, 1, 20243, 20243]) Position ids shape: torch.Size([1, 20243]) Input IDs shape: torch.Size([1, 20243]) Labels shape: torch.Size([1, 20243]) Final batch size: 1, sequence length: 14892 Attention mask shape: torch.Size([1, 1, 14892, 14892]) Position ids shape: torch.Size([1, 14892]) Input IDs shape: torch.Size([1, 14892]) Labels shape: torch.Size([1, 14892]) Final batch size: 1, sequence length: 16585 Attention mask shape: torch.Size([1, 1, 16585, 16585]) Position ids shape: torch.Size([1, 16585]) Input IDs shape: torch.Size([1, 16585]) Labels shape: torch.Size([1, 16585]) Final batch size: 1, sequence length: 10182 Attention mask shape: torch.Size([1, 1, 10182, 10182]) Position ids shape: torch.Size([1, 10182]) Input IDs shape: torch.Size([1, 10182]) Labels shape: torch.Size([1, 10182]) Final batch size: 1, sequence length: 15339 Attention mask shape: torch.Size([1, 1, 15339, 15339]) Position ids shape: torch.Size([1, 15339]) Input IDs shape: torch.Size([1, 15339]) Labels shape: torch.Size([1, 15339]) Final batch size: 1, sequence length: 22133 Attention mask shape: torch.Size([1, 1, 22133, 22133]) Position ids shape: torch.Size([1, 22133]) Input IDs shape: torch.Size([1, 22133]) Labels shape: torch.Size([1, 22133]) Final batch size: 1, sequence length: 20562 Attention mask shape: torch.Size([1, 1, 20562, 20562]) Position ids shape: torch.Size([1, 20562]) Input IDs shape: torch.Size([1, 20562]) Labels shape: torch.Size([1, 20562]) Final batch size: 1, sequence length: 20432 Attention mask shape: torch.Size([1, 1, 20432, 20432]) Position ids shape: torch.Size([1, 20432]) Input IDs shape: torch.Size([1, 20432]) Labels shape: torch.Size([1, 20432]) Final batch size: 1, sequence length: 23238 Attention mask shape: torch.Size([1, 1, 23238, 23238]) Position ids shape: torch.Size([1, 23238]) Input IDs shape: torch.Size([1, 23238]) Labels shape: torch.Size([1, 23238]) Final batch size: 1, sequence length: 22857 Attention mask shape: torch.Size([1, 1, 22857, 22857]) Position ids shape: torch.Size([1, 22857]) Input IDs shape: torch.Size([1, 22857]) Labels shape: torch.Size([1, 22857]) Final batch size: 1, sequence length: 24002 Attention mask shape: torch.Size([1, 1, 24002, 24002]) Position ids shape: torch.Size([1, 24002]) Input IDs shape: torch.Size([1, 24002]) Labels shape: torch.Size([1, 24002]) Final batch size: 1, sequence length: 24769 Attention mask shape: torch.Size([1, 1, 24769, 24769]) Position ids shape: torch.Size([1, 24769]) Input IDs shape: torch.Size([1, 24769]) Labels shape: torch.Size([1, 24769]) Final batch size: 1, sequence length: 24428 Attention mask shape: torch.Size([1, 1, 24428, 24428]) Position ids shape: torch.Size([1, 24428]) Input IDs shape: torch.Size([1, 24428]) Labels shape: torch.Size([1, 24428]) Final batch size: 1, sequence length: 14556 Attention mask shape: torch.Size([1, 1, 14556, 14556]) Position ids shape: torch.Size([1, 14556]) Input IDs shape: torch.Size([1, 14556]) Labels shape: torch.Size([1, 14556]) Final batch size: 1, sequence length: 12012 Attention mask shape: torch.Size([1, 1, 12012, 12012]) Position ids shape: torch.Size([1, 12012]) Input IDs shape: torch.Size([1, 12012]) Labels shape: torch.Size([1, 12012]) Final batch size: 1, sequence length: 24255 Attention mask shape: torch.Size([1, 1, 24255, 24255]) Position ids shape: torch.Size([1, 24255]) Input IDs shape: torch.Size([1, 24255]) Labels shape: torch.Size([1, 24255]) Final batch size: 1, sequence length: 26033 Attention mask shape: torch.Size([1, 1, 26033, 26033]) Position ids shape: torch.Size([1, 26033]) Input IDs shape: torch.Size([1, 26033]) Labels shape: torch.Size([1, 26033]) Final batch size: 1, sequence length: 26144 Attention mask shape: torch.Size([1, 1, 26144, 26144]) Position ids shape: torch.Size([1, 26144]) Input IDs shape: torch.Size([1, 26144]) Labels shape: torch.Size([1, 26144]) Final batch size: 1, sequence length: 26312 Attention mask shape: torch.Size([1, 1, 26312, 26312]) Position ids shape: torch.Size([1, 26312]) Input IDs shape: torch.Size([1, 26312]) Labels shape: torch.Size([1, 26312]) Final batch size: 1, sequence length: 16541 Attention mask shape: torch.Size([1, 1, 16541, 16541]) Position ids shape: torch.Size([1, 16541]) Input IDs shape: torch.Size([1, 16541]) Labels shape: torch.Size([1, 16541]) Final batch size: 1, sequence length: 21664 Attention mask shape: torch.Size([1, 1, 21664, 21664]) Position ids shape: torch.Size([1, 21664]) Input IDs shape: torch.Size([1, 21664]) Labels shape: torch.Size([1, 21664]) Final batch size: 1, sequence length: 25707 Attention mask shape: torch.Size([1, 1, 25707, 25707]) Position ids shape: torch.Size([1, 25707]) Input IDs shape: torch.Size([1, 25707]) Labels shape: torch.Size([1, 25707]) Final batch size: 1, sequence length: 24858 Attention mask shape: torch.Size([1, 1, 24858, 24858]) Position ids shape: torch.Size([1, 24858]) Input IDs shape: torch.Size([1, 24858]) Labels shape: torch.Size([1, 24858]) Final batch size: 1, sequence length: 23602 Attention mask shape: torch.Size([1, 1, 23602, 23602]) Position ids shape: torch.Size([1, 23602]) Input IDs shape: torch.Size([1, 23602]) Labels shape: torch.Size([1, 23602]) Final batch size: 1, sequence length: 29948 Attention mask shape: torch.Size([1, 1, 29948, 29948]) Position ids shape: torch.Size([1, 29948]) Input IDs shape: torch.Size([1, 29948]) Labels shape: torch.Size([1, 29948]) Final batch size: 1, sequence length: 10318 Attention mask shape: torch.Size([1, 1, 10318, 10318]) Position ids shape: torch.Size([1, 10318]) Input IDs shape: torch.Size([1, 10318]) Labels shape: torch.Size([1, 10318]) Final batch size: 1, sequence length: 25758 Attention mask shape: torch.Size([1, 1, 25758, 25758]) Position ids shape: torch.Size([1, 25758]) Input IDs shape: torch.Size([1, 25758]) Labels shape: torch.Size([1, 25758]) Final batch size: 1, sequence length: 30079 Attention mask shape: torch.Size([1, 1, 30079, 30079]) Position ids shape: torch.Size([1, 30079]) Input IDs shape: torch.Size([1, 30079]) Labels shape: torch.Size([1, 30079]) Final batch size: 1, sequence length: 23698 Attention mask shape: torch.Size([1, 1, 23698, 23698]) Position ids shape: torch.Size([1, 23698]) Input IDs shape: torch.Size([1, 23698]) Labels shape: torch.Size([1, 23698]) Final batch size: 1, sequence length: 21374 Attention mask shape: torch.Size([1, 1, 21374, 21374]) Position ids shape: torch.Size([1, 21374]) Input IDs shape: torch.Size([1, 21374]) Labels shape: torch.Size([1, 21374]) Final batch size: 1, sequence length: 25719 Attention mask shape: torch.Size([1, 1, 25719, 25719]) Position ids shape: torch.Size([1, 25719]) Input IDs shape: torch.Size([1, 25719]) Labels shape: torch.Size([1, 25719]) Final batch size: 1, sequence length: 29825 Attention mask shape: torch.Size([1, 1, 29825, 29825]) Position ids shape: torch.Size([1, 29825]) Input IDs shape: torch.Size([1, 29825]) Labels shape: torch.Size([1, 29825]) Final batch size: 1, sequence length: 19238 Attention mask shape: torch.Size([1, 1, 19238, 19238]) Position ids shape: torch.Size([1, 19238]) Input IDs shape: torch.Size([1, 19238]) Labels shape: torch.Size([1, 19238]) Final batch size: 1, sequence length: 32541 Attention mask shape: torch.Size([1, 1, 32541, 32541]) Position ids shape: torch.Size([1, 32541]) Input IDs shape: torch.Size([1, 32541]) Labels shape: torch.Size([1, 32541]) Final batch size: 1, sequence length: 33611 Attention mask shape: torch.Size([1, 1, 33611, 33611]) Position ids shape: torch.Size([1, 33611]) Input IDs shape: torch.Size([1, 33611]) Labels shape: torch.Size([1, 33611]) Final batch size: 1, sequence length: 29749 Attention mask shape: torch.Size([1, 1, 29749, 29749]) Position ids shape: torch.Size([1, 29749]) Input IDs shape: torch.Size([1, 29749]) Labels shape: torch.Size([1, 29749]) Final batch size: 1, sequence length: 25886 Attention mask shape: torch.Size([1, 1, 25886, 25886]) Position ids shape: torch.Size([1, 25886]) Input IDs shape: torch.Size([1, 25886]) Labels shape: torch.Size([1, 25886]) Final batch size: 1, sequence length: 25656 Attention mask shape: torch.Size([1, 1, 25656, 25656]) Position ids shape: torch.Size([1, 25656]) Input IDs shape: torch.Size([1, 25656]) Labels shape: torch.Size([1, 25656]) Final batch size: 1, sequence length: 32071 Attention mask shape: torch.Size([1, 1, 32071, 32071]) Position ids shape: torch.Size([1, 32071]) Input IDs shape: torch.Size([1, 32071]) Labels shape: torch.Size([1, 32071]) Final batch size: 1, sequence length: 15604 Attention mask shape: torch.Size([1, 1, 15604, 15604]) Position ids shape: torch.Size([1, 15604]) Input IDs shape: torch.Size([1, 15604]) Labels shape: torch.Size([1, 15604]) Final batch size: 1, sequence length: 22139 Attention mask shape: torch.Size([1, 1, 22139, 22139]) Position ids shape: torch.Size([1, 22139]) Input IDs shape: torch.Size([1, 22139]) Labels shape: torch.Size([1, 22139]) Final batch size: 1, sequence length: 23399 Attention mask shape: torch.Size([1, 1, 23399, 23399]) Position ids shape: torch.Size([1, 23399]) Input IDs shape: torch.Size([1, 23399]) Labels shape: torch.Size([1, 23399]) Final batch size: 1, sequence length: 25492 Attention mask shape: torch.Size([1, 1, 25492, 25492]) Position ids shape: torch.Size([1, 25492]) Input IDs shape: torch.Size([1, 25492]) Labels shape: torch.Size([1, 25492]) Final batch size: 1, sequence length: 9897 Attention mask shape: torch.Size([1, 1, 9897, 9897]) Position ids shape: torch.Size([1, 9897]) Input IDs shape: torch.Size([1, 9897]) Labels shape: torch.Size([1, 9897]) Final batch size: 1, sequence length: 36778 Attention mask shape: torch.Size([1, 1, 36778, 36778]) Position ids shape: torch.Size([1, 36778]) Input IDs shape: torch.Size([1, 36778]) Labels shape: torch.Size([1, 36778]) Final batch size: 1, sequence length: 35760 Attention mask shape: torch.Size([1, 1, 35760, 35760]) Position ids shape: torch.Size([1, 35760]) Input IDs shape: torch.Size([1, 35760]) Labels shape: torch.Size([1, 35760]) Final batch size: 1, sequence length: 37241 Attention mask shape: torch.Size([1, 1, 37241, 37241]) Position ids shape: torch.Size([1, 37241]) Input IDs shape: torch.Size([1, 37241]) Labels shape: torch.Size([1, 37241]) Final batch size: 1, sequence length: 34701 Attention mask shape: torch.Size([1, 1, 34701, 34701]) Position ids shape: torch.Size([1, 34701]) Input IDs shape: torch.Size([1, 34701]) Labels shape: torch.Size([1, 34701]) Final batch size: 1, sequence length: 34186 Attention mask shape: torch.Size([1, 1, 34186, 34186]) Position ids shape: torch.Size([1, 34186]) Input IDs shape: torch.Size([1, 34186]) Labels shape: torch.Size([1, 34186]) Final batch size: 1, sequence length: 23343 Attention mask shape: torch.Size([1, 1, 23343, 23343]) Position ids shape: torch.Size([1, 23343]) Input IDs shape: torch.Size([1, 23343]) Labels shape: torch.Size([1, 23343]) Final batch size: 1, sequence length: 37728 Attention mask shape: torch.Size([1, 1, 37728, 37728]) Position ids shape: torch.Size([1, 37728]) Input IDs shape: torch.Size([1, 37728]) Labels shape: torch.Size([1, 37728]) Final batch size: 1, sequence length: 40593 Attention mask shape: torch.Size([1, 1, 40593, 40593]) Position ids shape: torch.Size([1, 40593]) Input IDs shape: torch.Size([1, 40593]) Labels shape: torch.Size([1, 40593]) Final batch size: 1, sequence length: 39760 Attention mask shape: torch.Size([1, 1, 39760, 39760]) Position ids shape: torch.Size([1, 39760]) Input IDs shape: torch.Size([1, 39760]) Labels shape: torch.Size([1, 39760]) Final batch size: 1, sequence length: 33459 Attention mask shape: torch.Size([1, 1, 33459, 33459]) Position ids shape: torch.Size([1, 33459]) Input IDs shape: torch.Size([1, 33459]) Labels shape: torch.Size([1, 33459]) Final batch size: 1, sequence length: 36456 Attention mask shape: torch.Size([1, 1, 36456, 36456]) Position ids shape: torch.Size([1, 36456]) Input IDs shape: torch.Size([1, 36456]) Labels shape: torch.Size([1, 36456]) Final batch size: 1, sequence length: 7681 Attention mask shape: torch.Size([1, 1, 7681, 7681]) Position ids shape: torch.Size([1, 7681]) Input IDs shape: torch.Size([1, 7681]) Labels shape: torch.Size([1, 7681]) Final batch size: 1, sequence length: 40147 Attention mask shape: torch.Size([1, 1, 40147, 40147]) Position ids shape: torch.Size([1, 40147]) Input IDs shape: torch.Size([1, 40147]) Labels shape: torch.Size([1, 40147]) Final batch size: 1, sequence length: 15294 Attention mask shape: torch.Size([1, 1, 15294, 15294]) Position ids shape: torch.Size([1, 15294]) Input IDs shape: torch.Size([1, 15294]) Labels shape: torch.Size([1, 15294]) Final batch size: 1, sequence length: 31650 Attention mask shape: torch.Size([1, 1, 31650, 31650]) Position ids shape: torch.Size([1, 31650]) Input IDs shape: torch.Size([1, 31650]) Labels shape: torch.Size([1, 31650]) Final batch size: 1, sequence length: 31323 Attention mask shape: torch.Size([1, 1, 31323, 31323]) Position ids shape: torch.Size([1, 31323]) Input IDs shape: torch.Size([1, 31323]) Labels shape: torch.Size([1, 31323]) Final batch size: 1, sequence length: 22491 Attention mask shape: torch.Size([1, 1, 22491, 22491]) Position ids shape: torch.Size([1, 22491]) Input IDs shape: torch.Size([1, 22491]) Labels shape: torch.Size([1, 22491]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40507 Attention mask shape: torch.Size([1, 1, 40507, 40507]) Position ids shape: torch.Size([1, 40507]) Input IDs shape: torch.Size([1, 40507]) Labels shape: torch.Size([1, 40507]) Final batch size: 1, sequence length: 21955 Attention mask shape: torch.Size([1, 1, 21955, 21955]) Position ids shape: torch.Size([1, 21955]) Input IDs shape: torch.Size([1, 21955]) Labels shape: torch.Size([1, 21955]) Final batch size: 1, sequence length: 27283 Attention mask shape: torch.Size([1, 1, 27283, 27283]) Position ids shape: torch.Size([1, 27283]) Input IDs shape: torch.Size([1, 27283]) Labels shape: torch.Size([1, 27283]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 21557 Attention mask shape: torch.Size([1, 1, 21557, 21557]) Position ids shape: torch.Size([1, 21557]) Input IDs shape: torch.Size([1, 21557]) Labels shape: torch.Size([1, 21557]) Final batch size: 1, sequence length: 9445 Attention mask shape: torch.Size([1, 1, 9445, 9445]) Position ids shape: torch.Size([1, 9445]) Input IDs shape: torch.Size([1, 9445]) Labels shape: torch.Size([1, 9445]) Final batch size: 1, sequence length: 21489 Attention mask shape: torch.Size([1, 1, 21489, 21489]) Position ids shape: torch.Size([1, 21489]) Input IDs shape: torch.Size([1, 21489]) Labels shape: torch.Size([1, 21489]) Final batch size: 1, sequence length: 29082 Attention mask shape: torch.Size([1, 1, 29082, 29082]) Position ids shape: torch.Size([1, 29082]) Input IDs shape: torch.Size([1, 29082]) Labels shape: torch.Size([1, 29082]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 22434 Attention mask shape: torch.Size([1, 1, 22434, 22434]) Position ids shape: torch.Size([1, 22434]) Input IDs shape: torch.Size([1, 22434]) Labels shape: torch.Size([1, 22434]) Final batch size: 1, sequence length: 37775 Attention mask shape: torch.Size([1, 1, 37775, 37775]) Position ids shape: torch.Size([1, 37775]) Input IDs shape: torch.Size([1, 37775]) Labels shape: torch.Size([1, 37775]) Final batch size: 1, sequence length: 17646 Attention mask shape: torch.Size([1, 1, 17646, 17646]) Position ids shape: torch.Size([1, 17646]) Input IDs shape: torch.Size([1, 17646]) Labels shape: torch.Size([1, 17646]) Final batch size: 1, sequence length: 19867 Attention mask shape: torch.Size([1, 1, 19867, 19867]) Position ids shape: torch.Size([1, 19867]) Input IDs shape: torch.Size([1, 19867]) Labels shape: torch.Size([1, 19867]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 26664 Attention mask shape: torch.Size([1, 1, 26664, 26664]) Position ids shape: torch.Size([1, 26664]) Input IDs shape: torch.Size([1, 26664]) Labels shape: torch.Size([1, 26664]) Final batch size: 1, sequence length: 37183 Attention mask shape: torch.Size([1, 1, 37183, 37183]) Position ids shape: torch.Size([1, 37183]) Input IDs shape: torch.Size([1, 37183]) Labels shape: torch.Size([1, 37183]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 23014 Attention mask shape: torch.Size([1, 1, 23014, 23014]) Position ids shape: torch.Size([1, 23014]) Input IDs shape: torch.Size([1, 23014]) Labels shape: torch.Size([1, 23014]) Final batch size: 1, sequence length: 16273 Attention mask shape: torch.Size([1, 1, 16273, 16273]) Position ids shape: torch.Size([1, 16273]) Input IDs shape: torch.Size([1, 16273]) Labels shape: torch.Size([1, 16273]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 38686 Attention mask shape: torch.Size([1, 1, 38686, 38686]) Position ids shape: torch.Size([1, 38686]) Input IDs shape: torch.Size([1, 38686]) Labels shape: torch.Size([1, 38686]) Final batch size: 1, sequence length: 36579 Attention mask shape: torch.Size([1, 1, 36579, 36579]) Position ids shape: torch.Size([1, 36579]) Input IDs shape: torch.Size([1, 36579]) Labels shape: torch.Size([1, 36579]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 28661 Attention mask shape: torch.Size([1, 1, 28661, 28661]) Position ids shape: torch.Size([1, 28661]) Input IDs shape: torch.Size([1, 28661]) Labels shape: torch.Size([1, 28661]) Final batch size: 1, sequence length: 33263 Attention mask shape: torch.Size([1, 1, 33263, 33263]) Position ids shape: torch.Size([1, 33263]) Input IDs shape: torch.Size([1, 33263]) Labels shape: torch.Size([1, 33263]) Final batch size: 1, sequence length: 37091 Attention mask shape: torch.Size([1, 1, 37091, 37091]) Position ids shape: torch.Size([1, 37091]) Input IDs shape: torch.Size([1, 37091]) Labels shape: torch.Size([1, 37091]) Final batch size: 1, sequence length: 30364 Attention mask shape: torch.Size([1, 1, 30364, 30364]) Position ids shape: torch.Size([1, 30364]) Input IDs shape: torch.Size([1, 30364]) Labels shape: torch.Size([1, 30364]) Final batch size: 1, sequence length: 25914 Attention mask shape: torch.Size([1, 1, 25914, 25914]) Position ids shape: torch.Size([1, 25914]) Input IDs shape: torch.Size([1, 25914]) Labels shape: torch.Size([1, 25914]) Final batch size: 1, sequence length: 36297 Attention mask shape: torch.Size([1, 1, 36297, 36297]) Position ids shape: torch.Size([1, 36297]) Input IDs shape: torch.Size([1, 36297]) Labels shape: torch.Size([1, 36297]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) {'loss': 0.3041, 'grad_norm': 0.4261350596976193, 'learning_rate': 8.535533905932739e-06, 'num_tokens': -inf, 'epoch': 2.5} Final batch size: 1, sequence length: 6519 Attention mask shape: torch.Size([1, 1, 6519, 6519]) Position ids shape: torch.Size([1, 6519]) Input IDs shape: torch.Size([1, 6519]) Labels shape: torch.Size([1, 6519]) Final batch size: 1, sequence length: 5525 Attention mask shape: torch.Size([1, 1, 5525, 5525]) Position ids shape: torch.Size([1, 5525]) Input IDs shape: torch.Size([1, 5525]) Labels shape: torch.Size([1, 5525]) Final batch size: 1, sequence length: 10273 Attention mask shape: torch.Size([1, 1, 10273, 10273]) Position ids shape: torch.Size([1, 10273]) Input IDs shape: torch.Size([1, 10273]) Labels shape: torch.Size([1, 10273]) Final batch size: 1, sequence length: 12281 Attention mask shape: torch.Size([1, 1, 12281, 12281]) Position ids shape: torch.Size([1, 12281]) Input IDs shape: torch.Size([1, 12281]) Labels shape: torch.Size([1, 12281]) Final batch size: 1, sequence length: 12927 Attention mask shape: torch.Size([1, 1, 12927, 12927]) Position ids shape: torch.Size([1, 12927]) Input IDs shape: torch.Size([1, 12927]) Labels shape: torch.Size([1, 12927]) Final batch size: 1, sequence length: 13363 Attention mask shape: torch.Size([1, 1, 13363, 13363]) Position ids shape: torch.Size([1, 13363]) Input IDs shape: torch.Size([1, 13363]) Labels shape: torch.Size([1, 13363]) Final batch size: 1, sequence length: 13385 Attention mask shape: torch.Size([1, 1, 13385, 13385]) Position ids shape: torch.Size([1, 13385]) Input IDs shape: torch.Size([1, 13385]) Labels shape: torch.Size([1, 13385]) Final batch size: 1, sequence length: 10905 Attention mask shape: torch.Size([1, 1, 10905, 10905]) Position ids shape: torch.Size([1, 10905]) Input IDs shape: torch.Size([1, 10905]) Labels shape: torch.Size([1, 10905]) Final batch size: 1, sequence length: 10408 Attention mask shape: torch.Size([1, 1, 10408, 10408]) Position ids shape: torch.Size([1, 10408]) Input IDs shape: torch.Size([1, 10408]) Labels shape: torch.Size([1, 10408]) Final batch size: 1, sequence length: 13804 Attention mask shape: torch.Size([1, 1, 13804, 13804]) Position ids shape: torch.Size([1, 13804]) Input IDs shape: torch.Size([1, 13804]) Labels shape: torch.Size([1, 13804]) Final batch size: 1, sequence length: 9181 Attention mask shape: torch.Size([1, 1, 9181, 9181]) Position ids shape: torch.Size([1, 9181]) Input IDs shape: torch.Size([1, 9181]) Labels shape: torch.Size([1, 9181]) Final batch size: 1, sequence length: 15257 Attention mask shape: torch.Size([1, 1, 15257, 15257]) Position ids shape: torch.Size([1, 15257]) Input IDs shape: torch.Size([1, 15257]) Labels shape: torch.Size([1, 15257]) Final batch size: 1, sequence length: 17003 Attention mask shape: torch.Size([1, 1, 17003, 17003]) Position ids shape: torch.Size([1, 17003]) Input IDs shape: torch.Size([1, 17003]) Labels shape: torch.Size([1, 17003]) Final batch size: 1, sequence length: 15244 Attention mask shape: torch.Size([1, 1, 15244, 15244]) Position ids shape: torch.Size([1, 15244]) Input IDs shape: torch.Size([1, 15244]) Labels shape: torch.Size([1, 15244]) Final batch size: 1, sequence length: 15518 Attention mask shape: torch.Size([1, 1, 15518, 15518]) Position ids shape: torch.Size([1, 15518]) Input IDs shape: torch.Size([1, 15518]) Labels shape: torch.Size([1, 15518]) Final batch size: 1, sequence length: 16520 Attention mask shape: torch.Size([1, 1, 16520, 16520]) Position ids shape: torch.Size([1, 16520]) Input IDs shape: torch.Size([1, 16520]) Labels shape: torch.Size([1, 16520]) Final batch size: 1, sequence length: 20089 Attention mask shape: torch.Size([1, 1, 20089, 20089]) Position ids shape: torch.Size([1, 20089]) Input IDs shape: torch.Size([1, 20089]) Labels shape: torch.Size([1, 20089]) Final batch size: 1, sequence length: 20979 Attention mask shape: torch.Size([1, 1, 20979, 20979]) Position ids shape: torch.Size([1, 20979]) Input IDs shape: torch.Size([1, 20979]) Labels shape: torch.Size([1, 20979]) Final batch size: 1, sequence length: 19768 Attention mask shape: torch.Size([1, 1, 19768, 19768]) Position ids shape: torch.Size([1, 19768]) Input IDs shape: torch.Size([1, 19768]) Labels shape: torch.Size([1, 19768]) Final batch size: 1, sequence length: 18819 Attention mask shape: torch.Size([1, 1, 18819, 18819]) Position ids shape: torch.Size([1, 18819]) Input IDs shape: torch.Size([1, 18819]) Labels shape: torch.Size([1, 18819]) Final batch size: 1, sequence length: 18645 Attention mask shape: torch.Size([1, 1, 18645, 18645]) Position ids shape: torch.Size([1, 18645]) Input IDs shape: torch.Size([1, 18645]) Labels shape: torch.Size([1, 18645]) Final batch size: 1, sequence length: 8646 Attention mask shape: torch.Size([1, 1, 8646, 8646]) Position ids shape: torch.Size([1, 8646]) Input IDs shape: torch.Size([1, 8646]) Labels shape: torch.Size([1, 8646]) Final batch size: 1, sequence length: 14429 Attention mask shape: torch.Size([1, 1, 14429, 14429]) Position ids shape: torch.Size([1, 14429]) Input IDs shape: torch.Size([1, 14429]) Labels shape: torch.Size([1, 14429]) Final batch size: 1, sequence length: 22777 Attention mask shape: torch.Size([1, 1, 22777, 22777]) Position ids shape: torch.Size([1, 22777]) Input IDs shape: torch.Size([1, 22777]) Labels shape: torch.Size([1, 22777]) Final batch size: 1, sequence length: 23886 Attention mask shape: torch.Size([1, 1, 23886, 23886]) Position ids shape: torch.Size([1, 23886]) Input IDs shape: torch.Size([1, 23886]) Labels shape: torch.Size([1, 23886]) Final batch size: 1, sequence length: 18988 Attention mask shape: torch.Size([1, 1, 18988, 18988]) Position ids shape: torch.Size([1, 18988]) Input IDs shape: torch.Size([1, 18988]) Labels shape: torch.Size([1, 18988]) Final batch size: 1, sequence length: 22311 Attention mask shape: torch.Size([1, 1, 22311, 22311]) Position ids shape: torch.Size([1, 22311]) Input IDs shape: torch.Size([1, 22311]) Labels shape: torch.Size([1, 22311]) Final batch size: 1, sequence length: 19513 Attention mask shape: torch.Size([1, 1, 19513, 19513]) Position ids shape: torch.Size([1, 19513]) Input IDs shape: torch.Size([1, 19513]) Labels shape: torch.Size([1, 19513]) Final batch size: 1, sequence length: 25001 Attention mask shape: torch.Size([1, 1, 25001, 25001]) Position ids shape: torch.Size([1, 25001]) Input IDs shape: torch.Size([1, 25001]) Labels shape: torch.Size([1, 25001]) Final batch size: 1, sequence length: 22979 Attention mask shape: torch.Size([1, 1, 22979, 22979]) Position ids shape: torch.Size([1, 22979]) Input IDs shape: torch.Size([1, 22979]) Labels shape: torch.Size([1, 22979]) Final batch size: 1, sequence length: 20106 Attention mask shape: torch.Size([1, 1, 20106, 20106]) Position ids shape: torch.Size([1, 20106]) Input IDs shape: torch.Size([1, 20106]) Labels shape: torch.Size([1, 20106]) Final batch size: 1, sequence length: 23341 Attention mask shape: torch.Size([1, 1, 23341, 23341]) Position ids shape: torch.Size([1, 23341]) Input IDs shape: torch.Size([1, 23341]) Labels shape: torch.Size([1, 23341]) Final batch size: 1, sequence length: 23942 Attention mask shape: torch.Size([1, 1, 23942, 23942]) Position ids shape: torch.Size([1, 23942]) Input IDs shape: torch.Size([1, 23942]) Labels shape: torch.Size([1, 23942]) Final batch size: 1, sequence length: 22160 Attention mask shape: torch.Size([1, 1, 22160, 22160]) Position ids shape: torch.Size([1, 22160]) Input IDs shape: torch.Size([1, 22160]) Labels shape: torch.Size([1, 22160]) Final batch size: 1, sequence length: 23995 Attention mask shape: torch.Size([1, 1, 23995, 23995]) Position ids shape: torch.Size([1, 23995]) Input IDs shape: torch.Size([1, 23995]) Labels shape: torch.Size([1, 23995]) Final batch size: 1, sequence length: 19286 Attention mask shape: torch.Size([1, 1, 19286, 19286]) Position ids shape: torch.Size([1, 19286]) Input IDs shape: torch.Size([1, 19286]) Labels shape: torch.Size([1, 19286]) Final batch size: 1, sequence length: 24287 Attention mask shape: torch.Size([1, 1, 24287, 24287]) Position ids shape: torch.Size([1, 24287]) Input IDs shape: torch.Size([1, 24287]) Labels shape: torch.Size([1, 24287]) Final batch size: 1, sequence length: 25021 Attention mask shape: torch.Size([1, 1, 25021, 25021]) Position ids shape: torch.Size([1, 25021]) Input IDs shape: torch.Size([1, 25021]) Labels shape: torch.Size([1, 25021]) Final batch size: 1, sequence length: 5288 Attention mask shape: torch.Size([1, 1, 5288, 5288]) Position ids shape: torch.Size([1, 5288]) Input IDs shape: torch.Size([1, 5288]) Labels shape: torch.Size([1, 5288]) Final batch size: 1, sequence length: 21660 Attention mask shape: torch.Size([1, 1, 21660, 21660]) Position ids shape: torch.Size([1, 21660]) Input IDs shape: torch.Size([1, 21660]) Labels shape: torch.Size([1, 21660]) Final batch size: 1, sequence length: 13600 Attention mask shape: torch.Size([1, 1, 13600, 13600]) Position ids shape: torch.Size([1, 13600]) Input IDs shape: torch.Size([1, 13600]) Labels shape: torch.Size([1, 13600]) Final batch size: 1, sequence length: 22915 Attention mask shape: torch.Size([1, 1, 22915, 22915]) Position ids shape: torch.Size([1, 22915]) Input IDs shape: torch.Size([1, 22915]) Labels shape: torch.Size([1, 22915]) Final batch size: 1, sequence length: 23724 Attention mask shape: torch.Size([1, 1, 23724, 23724]) Position ids shape: torch.Size([1, 23724]) Input IDs shape: torch.Size([1, 23724]) Labels shape: torch.Size([1, 23724]) Final batch size: 1, sequence length: 24407 Attention mask shape: torch.Size([1, 1, 24407, 24407]) Position ids shape: torch.Size([1, 24407]) Input IDs shape: torch.Size([1, 24407]) Labels shape: torch.Size([1, 24407]) Final batch size: 1, sequence length: 24293 Attention mask shape: torch.Size([1, 1, 24293, 24293]) Position ids shape: torch.Size([1, 24293]) Input IDs shape: torch.Size([1, 24293]) Labels shape: torch.Size([1, 24293]) Final batch size: 1, sequence length: 5405 Attention mask shape: torch.Size([1, 1, 5405, 5405]) Position ids shape: torch.Size([1, 5405]) Input IDs shape: torch.Size([1, 5405]) Labels shape: torch.Size([1, 5405]) Final batch size: 1, sequence length: 11266 Attention mask shape: torch.Size([1, 1, 11266, 11266]) Position ids shape: torch.Size([1, 11266]) Input IDs shape: torch.Size([1, 11266]) Labels shape: torch.Size([1, 11266]) Final batch size: 1, sequence length: 26054 Attention mask shape: torch.Size([1, 1, 26054, 26054]) Position ids shape: torch.Size([1, 26054]) Input IDs shape: torch.Size([1, 26054]) Labels shape: torch.Size([1, 26054]) Final batch size: 1, sequence length: 26461 Attention mask shape: torch.Size([1, 1, 26461, 26461]) Position ids shape: torch.Size([1, 26461]) Input IDs shape: torch.Size([1, 26461]) Labels shape: torch.Size([1, 26461]) Final batch size: 1, sequence length: 12483 Attention mask shape: torch.Size([1, 1, 12483, 12483]) Position ids shape: torch.Size([1, 12483]) Input IDs shape: torch.Size([1, 12483]) Labels shape: torch.Size([1, 12483]) Final batch size: 1, sequence length: 26179 Attention mask shape: torch.Size([1, 1, 26179, 26179]) Position ids shape: torch.Size([1, 26179]) Input IDs shape: torch.Size([1, 26179]) Labels shape: torch.Size([1, 26179]) Final batch size: 1, sequence length: 29009 Attention mask shape: torch.Size([1, 1, 29009, 29009]) Position ids shape: torch.Size([1, 29009]) Input IDs shape: torch.Size([1, 29009]) Labels shape: torch.Size([1, 29009]) Final batch size: 1, sequence length: 30236 Attention mask shape: torch.Size([1, 1, 30236, 30236]) Position ids shape: torch.Size([1, 30236]) Input IDs shape: torch.Size([1, 30236]) Labels shape: torch.Size([1, 30236]) Final batch size: 1, sequence length: 30723 Attention mask shape: torch.Size([1, 1, 30723, 30723]) Position ids shape: torch.Size([1, 30723]) Input IDs shape: torch.Size([1, 30723]) Labels shape: torch.Size([1, 30723]) Final batch size: 1, sequence length: 28263 Attention mask shape: torch.Size([1, 1, 28263, 28263]) Position ids shape: torch.Size([1, 28263]) Input IDs shape: torch.Size([1, 28263]) Labels shape: torch.Size([1, 28263]) Final batch size: 1, sequence length: 17649 Attention mask shape: torch.Size([1, 1, 17649, 17649]) Position ids shape: torch.Size([1, 17649]) Input IDs shape: torch.Size([1, 17649]) Labels shape: torch.Size([1, 17649]) Final batch size: 1, sequence length: 29237 Attention mask shape: torch.Size([1, 1, 29237, 29237]) Position ids shape: torch.Size([1, 29237]) Input IDs shape: torch.Size([1, 29237]) Labels shape: torch.Size([1, 29237]) Final batch size: 1, sequence length: 17539 Attention mask shape: torch.Size([1, 1, 17539, 17539]) Position ids shape: torch.Size([1, 17539]) Input IDs shape: torch.Size([1, 17539]) Labels shape: torch.Size([1, 17539]) Final batch size: 1, sequence length: 28440 Attention mask shape: torch.Size([1, 1, 28440, 28440]) Position ids shape: torch.Size([1, 28440]) Input IDs shape: torch.Size([1, 28440]) Labels shape: torch.Size([1, 28440]) Final batch size: 1, sequence length: 20941 Attention mask shape: torch.Size([1, 1, 20941, 20941]) Position ids shape: torch.Size([1, 20941]) Input IDs shape: torch.Size([1, 20941]) Labels shape: torch.Size([1, 20941]) Final batch size: 1, sequence length: 17951 Attention mask shape: torch.Size([1, 1, 17951, 17951]) Position ids shape: torch.Size([1, 17951]) Input IDs shape: torch.Size([1, 17951]) Labels shape: torch.Size([1, 17951]) Final batch size: 1, sequence length: 32799 Attention mask shape: torch.Size([1, 1, 32799, 32799]) Position ids shape: torch.Size([1, 32799]) Input IDs shape: torch.Size([1, 32799]) Labels shape: torch.Size([1, 32799]) Final batch size: 1, sequence length: 26886 Attention mask shape: torch.Size([1, 1, 26886, 26886]) Position ids shape: torch.Size([1, 26886]) Input IDs shape: torch.Size([1, 26886]) Labels shape: torch.Size([1, 26886]) Final batch size: 1, sequence length: 32660 Attention mask shape: torch.Size([1, 1, 32660, 32660]) Position ids shape: torch.Size([1, 32660]) Input IDs shape: torch.Size([1, 32660]) Labels shape: torch.Size([1, 32660]) Final batch size: 1, sequence length: 32287 Attention mask shape: torch.Size([1, 1, 32287, 32287]) Position ids shape: torch.Size([1, 32287]) Input IDs shape: torch.Size([1, 32287]) Labels shape: torch.Size([1, 32287]) Final batch size: 1, sequence length: 21472 Attention mask shape: torch.Size([1, 1, 21472, 21472]) Position ids shape: torch.Size([1, 21472]) Input IDs shape: torch.Size([1, 21472]) Labels shape: torch.Size([1, 21472]) Final batch size: 1, sequence length: 29880 Attention mask shape: torch.Size([1, 1, 29880, 29880]) Position ids shape: torch.Size([1, 29880]) Input IDs shape: torch.Size([1, 29880]) Labels shape: torch.Size([1, 29880]) Final batch size: 1, sequence length: 29109 Attention mask shape: torch.Size([1, 1, 29109, 29109]) Position ids shape: torch.Size([1, 29109]) Input IDs shape: torch.Size([1, 29109]) Labels shape: torch.Size([1, 29109]) Final batch size: 1, sequence length: 27405 Attention mask shape: torch.Size([1, 1, 27405, 27405]) Position ids shape: torch.Size([1, 27405]) Input IDs shape: torch.Size([1, 27405]) Labels shape: torch.Size([1, 27405]) Final batch size: 1, sequence length: 34694 Attention mask shape: torch.Size([1, 1, 34694, 34694]) Position ids shape: torch.Size([1, 34694]) Input IDs shape: torch.Size([1, 34694]) Labels shape: torch.Size([1, 34694]) Final batch size: 1, sequence length: 15830 Attention mask shape: torch.Size([1, 1, 15830, 15830]) Position ids shape: torch.Size([1, 15830]) Input IDs shape: torch.Size([1, 15830]) Labels shape: torch.Size([1, 15830]) Final batch size: 1, sequence length: 32083 Attention mask shape: torch.Size([1, 1, 32083, 32083]) Position ids shape: torch.Size([1, 32083]) Input IDs shape: torch.Size([1, 32083]) Labels shape: torch.Size([1, 32083]) Final batch size: 1, sequence length: 31512 Attention mask shape: torch.Size([1, 1, 31512, 31512]) Position ids shape: torch.Size([1, 31512]) Input IDs shape: torch.Size([1, 31512]) Labels shape: torch.Size([1, 31512]) Final batch size: 1, sequence length: 29152 Attention mask shape: torch.Size([1, 1, 29152, 29152]) Position ids shape: torch.Size([1, 29152]) Input IDs shape: torch.Size([1, 29152]) Labels shape: torch.Size([1, 29152]) Final batch size: 1, sequence length: 29561 Attention mask shape: torch.Size([1, 1, 29561, 29561]) Position ids shape: torch.Size([1, 29561]) Input IDs shape: torch.Size([1, 29561]) Labels shape: torch.Size([1, 29561]) Final batch size: 1, sequence length: 26452 Attention mask shape: torch.Size([1, 1, 26452, 26452]) Position ids shape: torch.Size([1, 26452]) Input IDs shape: torch.Size([1, 26452]) Labels shape: torch.Size([1, 26452]) Final batch size: 1, sequence length: 8008 Attention mask shape: torch.Size([1, 1, 8008, 8008]) Position ids shape: torch.Size([1, 8008]) Input IDs shape: torch.Size([1, 8008]) Labels shape: torch.Size([1, 8008]) Final batch size: 1, sequence length: 35478 Attention mask shape: torch.Size([1, 1, 35478, 35478]) Position ids shape: torch.Size([1, 35478]) Input IDs shape: torch.Size([1, 35478]) Labels shape: torch.Size([1, 35478]) Final batch size: 1, sequence length: 27987 Attention mask shape: torch.Size([1, 1, 27987, 27987]) Position ids shape: torch.Size([1, 27987]) Input IDs shape: torch.Size([1, 27987]) Labels shape: torch.Size([1, 27987]) Final batch size: 1, sequence length: 15906 Attention mask shape: torch.Size([1, 1, 15906, 15906]) Position ids shape: torch.Size([1, 15906]) Input IDs shape: torch.Size([1, 15906]) Labels shape: torch.Size([1, 15906]) Final batch size: 1, sequence length: 36314 Attention mask shape: torch.Size([1, 1, 36314, 36314]) Position ids shape: torch.Size([1, 36314]) Input IDs shape: torch.Size([1, 36314]) Labels shape: torch.Size([1, 36314]) Final batch size: 1, sequence length: 35999 Attention mask shape: torch.Size([1, 1, 35999, 35999]) Position ids shape: torch.Size([1, 35999]) Input IDs shape: torch.Size([1, 35999]) Labels shape: torch.Size([1, 35999]) Final batch size: 1, sequence length: 7448 Attention mask shape: torch.Size([1, 1, 7448, 7448]) Position ids shape: torch.Size([1, 7448]) Input IDs shape: torch.Size([1, 7448]) Labels shape: torch.Size([1, 7448]) Final batch size: 1, sequence length: 17775 Attention mask shape: torch.Size([1, 1, 17775, 17775]) Position ids shape: torch.Size([1, 17775]) Input IDs shape: torch.Size([1, 17775]) Labels shape: torch.Size([1, 17775]) Final batch size: 1, sequence length: 23945 Attention mask shape: torch.Size([1, 1, 23945, 23945]) Position ids shape: torch.Size([1, 23945]) Input IDs shape: torch.Size([1, 23945]) Labels shape: torch.Size([1, 23945]) Final batch size: 1, sequence length: 24056 Attention mask shape: torch.Size([1, 1, 24056, 24056]) Position ids shape: torch.Size([1, 24056]) Input IDs shape: torch.Size([1, 24056]) Labels shape: torch.Size([1, 24056]) Final batch size: 1, sequence length: 35696 Attention mask shape: torch.Size([1, 1, 35696, 35696]) Position ids shape: torch.Size([1, 35696]) Input IDs shape: torch.Size([1, 35696]) Labels shape: torch.Size([1, 35696]) Final batch size: 1, sequence length: 29262 Attention mask shape: torch.Size([1, 1, 29262, 29262]) Position ids shape: torch.Size([1, 29262]) Input IDs shape: torch.Size([1, 29262]) Labels shape: torch.Size([1, 29262]) Final batch size: 1, sequence length: 31786 Attention mask shape: torch.Size([1, 1, 31786, 31786]) Position ids shape: torch.Size([1, 31786]) Input IDs shape: torch.Size([1, 31786]) Labels shape: torch.Size([1, 31786]) Final batch size: 1, sequence length: 16852 Attention mask shape: torch.Size([1, 1, 16852, 16852]) Position ids shape: torch.Size([1, 16852]) Input IDs shape: torch.Size([1, 16852]) Labels shape: torch.Size([1, 16852]) Final batch size: 1, sequence length: 28879 Attention mask shape: torch.Size([1, 1, 28879, 28879]) Position ids shape: torch.Size([1, 28879]) Input IDs shape: torch.Size([1, 28879]) Labels shape: torch.Size([1, 28879]) Final batch size: 1, sequence length: 39476 Attention mask shape: torch.Size([1, 1, 39476, 39476]) Position ids shape: torch.Size([1, 39476]) Input IDs shape: torch.Size([1, 39476]) Labels shape: torch.Size([1, 39476]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 16631 Attention mask shape: torch.Size([1, 1, 16631, 16631]) Position ids shape: torch.Size([1, 16631]) Input IDs shape: torch.Size([1, 16631]) Labels shape: torch.Size([1, 16631]) Final batch size: 1, sequence length: 6378 Attention mask shape: torch.Size([1, 1, 6378, 6378]) Position ids shape: torch.Size([1, 6378]) Input IDs shape: torch.Size([1, 6378]) Labels shape: torch.Size([1, 6378]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 31132 Attention mask shape: torch.Size([1, 1, 31132, 31132]) Position ids shape: torch.Size([1, 31132]) Input IDs shape: torch.Size([1, 31132]) Labels shape: torch.Size([1, 31132]) Final batch size: 1, sequence length: 34777 Attention mask shape: torch.Size([1, 1, 34777, 34777]) Position ids shape: torch.Size([1, 34777]) Input IDs shape: torch.Size([1, 34777]) Labels shape: torch.Size([1, 34777]) Final batch size: 1, sequence length: 35526 Attention mask shape: torch.Size([1, 1, 35526, 35526]) Position ids shape: torch.Size([1, 35526]) Input IDs shape: torch.Size([1, 35526]) Labels shape: torch.Size([1, 35526]) Final batch size: 1, sequence length: 10480 Attention mask shape: torch.Size([1, 1, 10480, 10480]) Position ids shape: torch.Size([1, 10480]) Input IDs shape: torch.Size([1, 10480]) Labels shape: torch.Size([1, 10480]) Final batch size: 1, sequence length: 28691 Attention mask shape: torch.Size([1, 1, 28691, 28691]) Position ids shape: torch.Size([1, 28691]) Input IDs shape: torch.Size([1, 28691]) Labels shape: torch.Size([1, 28691]) Final batch size: 1, sequence length: 11395 Attention mask shape: torch.Size([1, 1, 11395, 11395]) Position ids shape: torch.Size([1, 11395]) Input IDs shape: torch.Size([1, 11395]) Labels shape: torch.Size([1, 11395]) Final batch size: 1, sequence length: 37933 Attention mask shape: torch.Size([1, 1, 37933, 37933]) Position ids shape: torch.Size([1, 37933]) Input IDs shape: torch.Size([1, 37933]) Labels shape: torch.Size([1, 37933]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 28495 Attention mask shape: torch.Size([1, 1, 28495, 28495]) Position ids shape: torch.Size([1, 28495]) Input IDs shape: torch.Size([1, 28495]) Labels shape: torch.Size([1, 28495]) Final batch size: 1, sequence length: 24824 Attention mask shape: torch.Size([1, 1, 24824, 24824]) Position ids shape: torch.Size([1, 24824]) Input IDs shape: torch.Size([1, 24824]) Labels shape: torch.Size([1, 24824]) Final batch size: 1, sequence length: 20176 Attention mask shape: torch.Size([1, 1, 20176, 20176]) Position ids shape: torch.Size([1, 20176]) Input IDs shape: torch.Size([1, 20176]) Labels shape: torch.Size([1, 20176]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 20694 Attention mask shape: torch.Size([1, 1, 20694, 20694]) Position ids shape: torch.Size([1, 20694]) Input IDs shape: torch.Size([1, 20694]) Labels shape: torch.Size([1, 20694]) Final batch size: 1, sequence length: 18127 Attention mask shape: torch.Size([1, 1, 18127, 18127]) Position ids shape: torch.Size([1, 18127]) Input IDs shape: torch.Size([1, 18127]) Labels shape: torch.Size([1, 18127]) Final batch size: 1, sequence length: 38935 Attention mask shape: torch.Size([1, 1, 38935, 38935]) Position ids shape: torch.Size([1, 38935]) Input IDs shape: torch.Size([1, 38935]) Labels shape: torch.Size([1, 38935]) Final batch size: 1, sequence length: 18911 Attention mask shape: torch.Size([1, 1, 18911, 18911]) Position ids shape: torch.Size([1, 18911]) Input IDs shape: torch.Size([1, 18911]) Labels shape: torch.Size([1, 18911]) Final batch size: 1, sequence length: 30024 Attention mask shape: torch.Size([1, 1, 30024, 30024]) Position ids shape: torch.Size([1, 30024]) Input IDs shape: torch.Size([1, 30024]) Labels shape: torch.Size([1, 30024]) Final batch size: 1, sequence length: 14869 Attention mask shape: torch.Size([1, 1, 14869, 14869]) Position ids shape: torch.Size([1, 14869]) Input IDs shape: torch.Size([1, 14869]) Labels shape: torch.Size([1, 14869]) Final batch size: 1, sequence length: 40605 Attention mask shape: torch.Size([1, 1, 40605, 40605]) Position ids shape: torch.Size([1, 40605]) Input IDs shape: torch.Size([1, 40605]) Labels shape: torch.Size([1, 40605]) Final batch size: 1, sequence length: 37159 Attention mask shape: torch.Size([1, 1, 37159, 37159]) Position ids shape: torch.Size([1, 37159]) Input IDs shape: torch.Size([1, 37159]) Labels shape: torch.Size([1, 37159]) Final batch size: 1, sequence length: 35116 Attention mask shape: torch.Size([1, 1, 35116, 35116]) Position ids shape: torch.Size([1, 35116]) Input IDs shape: torch.Size([1, 35116]) Labels shape: torch.Size([1, 35116]) Final batch size: 1, sequence length: 39324 Attention mask shape: torch.Size([1, 1, 39324, 39324]) Position ids shape: torch.Size([1, 39324]) Input IDs shape: torch.Size([1, 39324]) Labels shape: torch.Size([1, 39324]) Final batch size: 1, sequence length: 36469 Attention mask shape: torch.Size([1, 1, 36469, 36469]) Position ids shape: torch.Size([1, 36469]) Input IDs shape: torch.Size([1, 36469]) Labels shape: torch.Size([1, 36469]) Final batch size: 1, sequence length: 28313 Attention mask shape: torch.Size([1, 1, 28313, 28313]) Position ids shape: torch.Size([1, 28313]) Input IDs shape: torch.Size([1, 28313]) Labels shape: torch.Size([1, 28313]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 30245 Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 30245, 30245]) Position ids shape: torch.Size([1, 30245]) Input IDs shape: torch.Size([1, 30245]) Labels shape: torch.Size([1, 30245]) Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 39687 Attention mask shape: torch.Size([1, 1, 39687, 39687]) Position ids shape: torch.Size([1, 39687]) Input IDs shape: torch.Size([1, 39687]) Labels shape: torch.Size([1, 39687]) Final batch size: 1, sequence length: 33070 Attention mask shape: torch.Size([1, 1, 33070, 33070]) Position ids shape: torch.Size([1, 33070]) Input IDs shape: torch.Size([1, 33070]) Labels shape: torch.Size([1, 33070]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 26402 Attention mask shape: torch.Size([1, 1, 26402, 26402]) Position ids shape: torch.Size([1, 26402]) Input IDs shape: torch.Size([1, 26402]) Labels shape: torch.Size([1, 26402]) {'loss': 0.2825, 'grad_norm': 0.37426502789365657, 'learning_rate': 8.345653031794292e-06, 'num_tokens': -inf, 'epoch': 2.62} Final batch size: 1, sequence length: 3968 Attention mask shape: torch.Size([1, 1, 3968, 3968]) Position ids shape: torch.Size([1, 3968]) Input IDs shape: torch.Size([1, 3968]) Labels shape: torch.Size([1, 3968]) Final batch size: 1, sequence length: 6567 Attention mask shape: torch.Size([1, 1, 6567, 6567]) Position ids shape: torch.Size([1, 6567]) Input IDs shape: torch.Size([1, 6567]) Labels shape: torch.Size([1, 6567]) Final batch size: 1, sequence length: 8347 Attention mask shape: torch.Size([1, 1, 8347, 8347]) Position ids shape: torch.Size([1, 8347]) Input IDs shape: torch.Size([1, 8347]) Labels shape: torch.Size([1, 8347]) Final batch size: 1, sequence length: 5997 Attention mask shape: torch.Size([1, 1, 5997, 5997]) Position ids shape: torch.Size([1, 5997]) Input IDs shape: torch.Size([1, 5997]) Labels shape: torch.Size([1, 5997]) Final batch size: 1, sequence length: 9027 Attention mask shape: torch.Size([1, 1, 9027, 9027]) Position ids shape: torch.Size([1, 9027]) Input IDs shape: torch.Size([1, 9027]) Labels shape: torch.Size([1, 9027]) Final batch size: 1, sequence length: 11232 Attention mask shape: torch.Size([1, 1, 11232, 11232]) Position ids shape: torch.Size([1, 11232]) Input IDs shape: torch.Size([1, 11232]) Labels shape: torch.Size([1, 11232]) Final batch size: 1, sequence length: 11880 Attention mask shape: torch.Size([1, 1, 11880, 11880]) Position ids shape: torch.Size([1, 11880]) Input IDs shape: torch.Size([1, 11880]) Labels shape: torch.Size([1, 11880]) Final batch size: 1, sequence length: 11054 Attention mask shape: torch.Size([1, 1, 11054, 11054]) Position ids shape: torch.Size([1, 11054]) Input IDs shape: torch.Size([1, 11054]) Labels shape: torch.Size([1, 11054]) Final batch size: 1, sequence length: 14758 Attention mask shape: torch.Size([1, 1, 14758, 14758]) Position ids shape: torch.Size([1, 14758]) Input IDs shape: torch.Size([1, 14758]) Labels shape: torch.Size([1, 14758]) Final batch size: 1, sequence length: 13260 Attention mask shape: torch.Size([1, 1, 13260, 13260]) Position ids shape: torch.Size([1, 13260]) Input IDs shape: torch.Size([1, 13260]) Labels shape: torch.Size([1, 13260]) Final batch size: 1, sequence length: 11752 Attention mask shape: torch.Size([1, 1, 11752, 11752]) Position ids shape: torch.Size([1, 11752]) Input IDs shape: torch.Size([1, 11752]) Labels shape: torch.Size([1, 11752]) Final batch size: 1, sequence length: 12370 Attention mask shape: torch.Size([1, 1, 12370, 12370]) Position ids shape: torch.Size([1, 12370]) Input IDs shape: torch.Size([1, 12370]) Labels shape: torch.Size([1, 12370]) Final batch size: 1, sequence length: 14512 Attention mask shape: torch.Size([1, 1, 14512, 14512]) Final batch size: 1, sequence length: 14597 Position ids shape: torch.Size([1, 14512]) Input IDs shape: torch.Size([1, 14512]) Labels shape: torch.Size([1, 14512]) Attention mask shape: torch.Size([1, 1, 14597, 14597]) Position ids shape: torch.Size([1, 14597]) Input IDs shape: torch.Size([1, 14597]) Labels shape: torch.Size([1, 14597]) Final batch size: 1, sequence length: 12967 Attention mask shape: torch.Size([1, 1, 12967, 12967]) Position ids shape: torch.Size([1, 12967]) Input IDs shape: torch.Size([1, 12967]) Labels shape: torch.Size([1, 12967]) Final batch size: 1, sequence length: 16014 Attention mask shape: torch.Size([1, 1, 16014, 16014]) Position ids shape: torch.Size([1, 16014]) Input IDs shape: torch.Size([1, 16014]) Labels shape: torch.Size([1, 16014]) Final batch size: 1, sequence length: 17166 Attention mask shape: torch.Size([1, 1, 17166, 17166]) Position ids shape: torch.Size([1, 17166]) Input IDs shape: torch.Size([1, 17166]) Labels shape: torch.Size([1, 17166]) Final batch size: 1, sequence length: 15438 Attention mask shape: torch.Size([1, 1, 15438, 15438]) Position ids shape: torch.Size([1, 15438]) Input IDs shape: torch.Size([1, 15438]) Labels shape: torch.Size([1, 15438]) Final batch size: 1, sequence length: 18050 Attention mask shape: torch.Size([1, 1, 18050, 18050]) Position ids shape: torch.Size([1, 18050]) Input IDs shape: torch.Size([1, 18050]) Labels shape: torch.Size([1, 18050]) Final batch size: 1, sequence length: 18264 Attention mask shape: torch.Size([1, 1, 18264, 18264]) Position ids shape: torch.Size([1, 18264]) Input IDs shape: torch.Size([1, 18264]) Labels shape: torch.Size([1, 18264]) Final batch size: 1, sequence length: 14520 Attention mask shape: torch.Size([1, 1, 14520, 14520]) Position ids shape: torch.Size([1, 14520]) Input IDs shape: torch.Size([1, 14520]) Labels shape: torch.Size([1, 14520]) Final batch size: 1, sequence length: 17092 Attention mask shape: torch.Size([1, 1, 17092, 17092]) Position ids shape: torch.Size([1, 17092]) Input IDs shape: torch.Size([1, 17092]) Labels shape: torch.Size([1, 17092]) Final batch size: 1, sequence length: 11206 Attention mask shape: torch.Size([1, 1, 11206, 11206]) Position ids shape: torch.Size([1, 11206]) Input IDs shape: torch.Size([1, 11206]) Labels shape: torch.Size([1, 11206]) Final batch size: 1, sequence length: 17839 Attention mask shape: torch.Size([1, 1, 17839, 17839]) Position ids shape: torch.Size([1, 17839]) Input IDs shape: torch.Size([1, 17839]) Labels shape: torch.Size([1, 17839]) Final batch size: 1, sequence length: 14335 Attention mask shape: torch.Size([1, 1, 14335, 14335]) Position ids shape: torch.Size([1, 14335]) Input IDs shape: torch.Size([1, 14335]) Labels shape: torch.Size([1, 14335]) Final batch size: 1, sequence length: 18131 Attention mask shape: torch.Size([1, 1, 18131, 18131]) Position ids shape: torch.Size([1, 18131]) Input IDs shape: torch.Size([1, 18131]) Labels shape: torch.Size([1, 18131]) Final batch size: 1, sequence length: 20947 Attention mask shape: torch.Size([1, 1, 20947, 20947]) Position ids shape: torch.Size([1, 20947]) Input IDs shape: torch.Size([1, 20947]) Labels shape: torch.Size([1, 20947]) Final batch size: 1, sequence length: 17194 Attention mask shape: torch.Size([1, 1, 17194, 17194]) Position ids shape: torch.Size([1, 17194]) Input IDs shape: torch.Size([1, 17194]) Labels shape: torch.Size([1, 17194]) Final batch size: 1, sequence length: 13061 Attention mask shape: torch.Size([1, 1, 13061, 13061]) Position ids shape: torch.Size([1, 13061]) Input IDs shape: torch.Size([1, 13061]) Labels shape: torch.Size([1, 13061]) Final batch size: 1, sequence length: 23324 Attention mask shape: torch.Size([1, 1, 23324, 23324]) Position ids shape: torch.Size([1, 23324]) Input IDs shape: torch.Size([1, 23324]) Labels shape: torch.Size([1, 23324]) Final batch size: 1, sequence length: 18950 Attention mask shape: torch.Size([1, 1, 18950, 18950]) Position ids shape: torch.Size([1, 18950]) Input IDs shape: torch.Size([1, 18950]) Labels shape: torch.Size([1, 18950]) Final batch size: 1, sequence length: 22107 Attention mask shape: torch.Size([1, 1, 22107, 22107]) Position ids shape: torch.Size([1, 22107]) Input IDs shape: torch.Size([1, 22107]) Labels shape: torch.Size([1, 22107]) Final batch size: 1, sequence length: 24566 Attention mask shape: torch.Size([1, 1, 24566, 24566]) Position ids shape: torch.Size([1, 24566]) Input IDs shape: torch.Size([1, 24566]) Labels shape: torch.Size([1, 24566]) Final batch size: 1, sequence length: 24432 Attention mask shape: torch.Size([1, 1, 24432, 24432]) Position ids shape: torch.Size([1, 24432]) Input IDs shape: torch.Size([1, 24432]) Labels shape: torch.Size([1, 24432]) Final batch size: 1, sequence length: 26068 Attention mask shape: torch.Size([1, 1, 26068, 26068]) Position ids shape: torch.Size([1, 26068]) Input IDs shape: torch.Size([1, 26068]) Labels shape: torch.Size([1, 26068]) Final batch size: 1, sequence length: 20487 Attention mask shape: torch.Size([1, 1, 20487, 20487]) Position ids shape: torch.Size([1, 20487]) Input IDs shape: torch.Size([1, 20487]) Labels shape: torch.Size([1, 20487]) Final batch size: 1, sequence length: 17115 Attention mask shape: torch.Size([1, 1, 17115, 17115]) Position ids shape: torch.Size([1, 17115]) Input IDs shape: torch.Size([1, 17115]) Labels shape: torch.Size([1, 17115]) Final batch size: 1, sequence length: 22547 Attention mask shape: torch.Size([1, 1, 22547, 22547]) Position ids shape: torch.Size([1, 22547]) Input IDs shape: torch.Size([1, 22547]) Labels shape: torch.Size([1, 22547]) Final batch size: 1, sequence length: 21137 Attention mask shape: torch.Size([1, 1, 21137, 21137]) Position ids shape: torch.Size([1, 21137]) Input IDs shape: torch.Size([1, 21137]) Labels shape: torch.Size([1, 21137]) Final batch size: 1, sequence length: 26271 Attention mask shape: torch.Size([1, 1, 26271, 26271]) Position ids shape: torch.Size([1, 26271]) Input IDs shape: torch.Size([1, 26271]) Labels shape: torch.Size([1, 26271]) Final batch size: 1, sequence length: 26333 Attention mask shape: torch.Size([1, 1, 26333, 26333]) Position ids shape: torch.Size([1, 26333]) Input IDs shape: torch.Size([1, 26333]) Labels shape: torch.Size([1, 26333]) Final batch size: 1, sequence length: 25252 Attention mask shape: torch.Size([1, 1, 25252, 25252]) Position ids shape: torch.Size([1, 25252]) Input IDs shape: torch.Size([1, 25252]) Labels shape: torch.Size([1, 25252]) Final batch size: 1, sequence length: 26937 Attention mask shape: torch.Size([1, 1, 26937, 26937]) Position ids shape: torch.Size([1, 26937]) Input IDs shape: torch.Size([1, 26937]) Labels shape: torch.Size([1, 26937]) Final batch size: 1, sequence length: 29236 Attention mask shape: torch.Size([1, 1, 29236, 29236]) Position ids shape: torch.Size([1, 29236]) Input IDs shape: torch.Size([1, 29236]) Labels shape: torch.Size([1, 29236]) Final batch size: 1, sequence length: 23960 Attention mask shape: torch.Size([1, 1, 23960, 23960]) Position ids shape: torch.Size([1, 23960]) Input IDs shape: torch.Size([1, 23960]) Labels shape: torch.Size([1, 23960]) Final batch size: 1, sequence length: 28858 Attention mask shape: torch.Size([1, 1, 28858, 28858]) Position ids shape: torch.Size([1, 28858]) Input IDs shape: torch.Size([1, 28858]) Labels shape: torch.Size([1, 28858]) Final batch size: 1, sequence length: 28284 Attention mask shape: torch.Size([1, 1, 28284, 28284]) Position ids shape: torch.Size([1, 28284]) Input IDs shape: torch.Size([1, 28284]) Labels shape: torch.Size([1, 28284]) Final batch size: 1, sequence length: 29768 Attention mask shape: torch.Size([1, 1, 29768, 29768]) Position ids shape: torch.Size([1, 29768]) Input IDs shape: torch.Size([1, 29768]) Labels shape: torch.Size([1, 29768]) Final batch size: 1, sequence length: 21728 Attention mask shape: torch.Size([1, 1, 21728, 21728]) Position ids shape: torch.Size([1, 21728]) Input IDs shape: torch.Size([1, 21728]) Labels shape: torch.Size([1, 21728]) Final batch size: 1, sequence length: 10286 Attention mask shape: torch.Size([1, 1, 10286, 10286]) Position ids shape: torch.Size([1, 10286]) Input IDs shape: torch.Size([1, 10286]) Labels shape: torch.Size([1, 10286]) Final batch size: 1, sequence length: 27278 Attention mask shape: torch.Size([1, 1, 27278, 27278]) Position ids shape: torch.Size([1, 27278]) Input IDs shape: torch.Size([1, 27278]) Labels shape: torch.Size([1, 27278]) Final batch size: 1, sequence length: 18051 Attention mask shape: torch.Size([1, 1, 18051, 18051]) Position ids shape: torch.Size([1, 18051]) Input IDs shape: torch.Size([1, 18051]) Labels shape: torch.Size([1, 18051]) Final batch size: 1, sequence length: 27293 Attention mask shape: torch.Size([1, 1, 27293, 27293]) Position ids shape: torch.Size([1, 27293]) Input IDs shape: torch.Size([1, 27293]) Labels shape: torch.Size([1, 27293]) Final batch size: 1, sequence length: 31377 Attention mask shape: torch.Size([1, 1, 31377, 31377]) Position ids shape: torch.Size([1, 31377]) Input IDs shape: torch.Size([1, 31377]) Labels shape: torch.Size([1, 31377]) Final batch size: 1, sequence length: 29478 Attention mask shape: torch.Size([1, 1, 29478, 29478]) Position ids shape: torch.Size([1, 29478]) Input IDs shape: torch.Size([1, 29478]) Labels shape: torch.Size([1, 29478]) Final batch size: 1, sequence length: 30862 Attention mask shape: torch.Size([1, 1, 30862, 30862]) Position ids shape: torch.Size([1, 30862]) Input IDs shape: torch.Size([1, 30862]) Labels shape: torch.Size([1, 30862]) Final batch size: 1, sequence length: 32392 Attention mask shape: torch.Size([1, 1, 32392, 32392]) Position ids shape: torch.Size([1, 32392]) Input IDs shape: torch.Size([1, 32392]) Labels shape: torch.Size([1, 32392]) Final batch size: 1, sequence length: 15229 Attention mask shape: torch.Size([1, 1, 15229, 15229]) Position ids shape: torch.Size([1, 15229]) Input IDs shape: torch.Size([1, 15229]) Labels shape: torch.Size([1, 15229]) Final batch size: 1, sequence length: 31860 Attention mask shape: torch.Size([1, 1, 31860, 31860]) Position ids shape: torch.Size([1, 31860]) Input IDs shape: torch.Size([1, 31860]) Labels shape: torch.Size([1, 31860]) Final batch size: 1, sequence length: 7364 Attention mask shape: torch.Size([1, 1, 7364, 7364]) Position ids shape: torch.Size([1, 7364]) Input IDs shape: torch.Size([1, 7364]) Labels shape: torch.Size([1, 7364]) Final batch size: 1, sequence length: 29592 Attention mask shape: torch.Size([1, 1, 29592, 29592]) Position ids shape: torch.Size([1, 29592]) Input IDs shape: torch.Size([1, 29592]) Labels shape: torch.Size([1, 29592]) Final batch size: 1, sequence length: 28421 Attention mask shape: torch.Size([1, 1, 28421, 28421]) Position ids shape: torch.Size([1, 28421]) Input IDs shape: torch.Size([1, 28421]) Labels shape: torch.Size([1, 28421]) Final batch size: 1, sequence length: 33736 Attention mask shape: torch.Size([1, 1, 33736, 33736]) Position ids shape: torch.Size([1, 33736]) Input IDs shape: torch.Size([1, 33736]) Labels shape: torch.Size([1, 33736]) Final batch size: 1, sequence length: 31740 Attention mask shape: torch.Size([1, 1, 31740, 31740]) Position ids shape: torch.Size([1, 31740]) Input IDs shape: torch.Size([1, 31740]) Labels shape: torch.Size([1, 31740]) Final batch size: 1, sequence length: 13257 Attention mask shape: torch.Size([1, 1, 13257, 13257]) Position ids shape: torch.Size([1, 13257]) Input IDs shape: torch.Size([1, 13257]) Labels shape: torch.Size([1, 13257]) Final batch size: 1, sequence length: 18408 Attention mask shape: torch.Size([1, 1, 18408, 18408]) Position ids shape: torch.Size([1, 18408]) Input IDs shape: torch.Size([1, 18408]) Labels shape: torch.Size([1, 18408]) Final batch size: 1, sequence length: 29489 Attention mask shape: torch.Size([1, 1, 29489, 29489]) Position ids shape: torch.Size([1, 29489]) Input IDs shape: torch.Size([1, 29489]) Labels shape: torch.Size([1, 29489]) Final batch size: 1, sequence length: 33097 Attention mask shape: torch.Size([1, 1, 33097, 33097]) Position ids shape: torch.Size([1, 33097]) Input IDs shape: torch.Size([1, 33097]) Labels shape: torch.Size([1, 33097]) Final batch size: 1, sequence length: 9974 Attention mask shape: torch.Size([1, 1, 9974, 9974]) Position ids shape: torch.Size([1, 9974]) Input IDs shape: torch.Size([1, 9974]) Labels shape: torch.Size([1, 9974]) Final batch size: 1, sequence length: 19888 Attention mask shape: torch.Size([1, 1, 19888, 19888]) Position ids shape: torch.Size([1, 19888]) Input IDs shape: torch.Size([1, 19888]) Labels shape: torch.Size([1, 19888]) Final batch size: 1, sequence length: 21500 Attention mask shape: torch.Size([1, 1, 21500, 21500]) Position ids shape: torch.Size([1, 21500]) Input IDs shape: torch.Size([1, 21500]) Labels shape: torch.Size([1, 21500]) Final batch size: 1, sequence length: 35171 Attention mask shape: torch.Size([1, 1, 35171, 35171]) Position ids shape: torch.Size([1, 35171]) Input IDs shape: torch.Size([1, 35171]) Labels shape: torch.Size([1, 35171]) Final batch size: 1, sequence length: 32181 Attention mask shape: torch.Size([1, 1, 32181, 32181]) Position ids shape: torch.Size([1, 32181]) Input IDs shape: torch.Size([1, 32181]) Labels shape: torch.Size([1, 32181]) Final batch size: 1, sequence length: 32162 Attention mask shape: torch.Size([1, 1, 32162, 32162]) Position ids shape: torch.Size([1, 32162]) Input IDs shape: torch.Size([1, 32162]) Labels shape: torch.Size([1, 32162]) Final batch size: 1, sequence length: 32112 Attention mask shape: torch.Size([1, 1, 32112, 32112]) Position ids shape: torch.Size([1, 32112]) Input IDs shape: torch.Size([1, 32112]) Labels shape: torch.Size([1, 32112]) Final batch size: 1, sequence length: 35596 Attention mask shape: torch.Size([1, 1, 35596, 35596]) Position ids shape: torch.Size([1, 35596]) Input IDs shape: torch.Size([1, 35596]) Labels shape: torch.Size([1, 35596]) Final batch size: 1, sequence length: 35069 Attention mask shape: torch.Size([1, 1, 35069, 35069]) Position ids shape: torch.Size([1, 35069]) Input IDs shape: torch.Size([1, 35069]) Labels shape: torch.Size([1, 35069]) Final batch size: 1, sequence length: 37623 Attention mask shape: torch.Size([1, 1, 37623, 37623]) Position ids shape: torch.Size([1, 37623]) Input IDs shape: torch.Size([1, 37623]) Labels shape: torch.Size([1, 37623]) Final batch size: 1, sequence length: 34514 Attention mask shape: torch.Size([1, 1, 34514, 34514]) Position ids shape: torch.Size([1, 34514]) Input IDs shape: torch.Size([1, 34514]) Labels shape: torch.Size([1, 34514]) Final batch size: 1, sequence length: 18711 Attention mask shape: torch.Size([1, 1, 18711, 18711]) Position ids shape: torch.Size([1, 18711]) Input IDs shape: torch.Size([1, 18711]) Labels shape: torch.Size([1, 18711]) Final batch size: 1, sequence length: 17777 Attention mask shape: torch.Size([1, 1, 17777, 17777]) Position ids shape: torch.Size([1, 17777]) Input IDs shape: torch.Size([1, 17777]) Labels shape: torch.Size([1, 17777]) Final batch size: 1, sequence length: 38790 Attention mask shape: torch.Size([1, 1, 38790, 38790]) Position ids shape: torch.Size([1, 38790]) Input IDs shape: torch.Size([1, 38790]) Labels shape: torch.Size([1, 38790]) Final batch size: 1, sequence length: 38351 Attention mask shape: torch.Size([1, 1, 38351, 38351]) Position ids shape: torch.Size([1, 38351]) Input IDs shape: torch.Size([1, 38351]) Labels shape: torch.Size([1, 38351]) Final batch size: 1, sequence length: 25484 Attention mask shape: torch.Size([1, 1, 25484, 25484]) Position ids shape: torch.Size([1, 25484]) Input IDs shape: torch.Size([1, 25484]) Labels shape: torch.Size([1, 25484]) Final batch size: 1, sequence length: 25386 Attention mask shape: torch.Size([1, 1, 25386, 25386]) Position ids shape: torch.Size([1, 25386]) Input IDs shape: torch.Size([1, 25386]) Labels shape: torch.Size([1, 25386]) Final batch size: 1, sequence length: 30051 Attention mask shape: torch.Size([1, 1, 30051, 30051]) Position ids shape: torch.Size([1, 30051]) Input IDs shape: torch.Size([1, 30051]) Labels shape: torch.Size([1, 30051]) Final batch size: 1, sequence length: 30302 Attention mask shape: torch.Size([1, 1, 30302, 30302]) Position ids shape: torch.Size([1, 30302]) Input IDs shape: torch.Size([1, 30302]) Labels shape: torch.Size([1, 30302]) Final batch size: 1, sequence length: 21408 Attention mask shape: torch.Size([1, 1, 21408, 21408]) Position ids shape: torch.Size([1, 21408]) Input IDs shape: torch.Size([1, 21408]) Labels shape: torch.Size([1, 21408]) Final batch size: 1, sequence length: 24061 Final batch size: 1, sequence length: 32979 Attention mask shape: torch.Size([1, 1, 24061, 24061]) Position ids shape: torch.Size([1, 24061]) Input IDs shape: torch.Size([1, 24061]) Labels shape: torch.Size([1, 24061]) Attention mask shape: torch.Size([1, 1, 32979, 32979]) Position ids shape: torch.Size([1, 32979]) Input IDs shape: torch.Size([1, 32979]) Labels shape: torch.Size([1, 32979]) Final batch size: 1, sequence length: 13557 Attention mask shape: torch.Size([1, 1, 13557, 13557]) Position ids shape: torch.Size([1, 13557]) Input IDs shape: torch.Size([1, 13557]) Labels shape: torch.Size([1, 13557]) Final batch size: 1, sequence length: 31679 Attention mask shape: torch.Size([1, 1, 31679, 31679]) Position ids shape: torch.Size([1, 31679]) Input IDs shape: torch.Size([1, 31679]) Labels shape: torch.Size([1, 31679]) Final batch size: 1, sequence length: 39754 Attention mask shape: torch.Size([1, 1, 39754, 39754]) Position ids shape: torch.Size([1, 39754]) Input IDs shape: torch.Size([1, 39754]) Labels shape: torch.Size([1, 39754]) Final batch size: 1, sequence length: 24943 Attention mask shape: torch.Size([1, 1, 24943, 24943]) Position ids shape: torch.Size([1, 24943]) Input IDs shape: torch.Size([1, 24943]) Labels shape: torch.Size([1, 24943]) Final batch size: 1, sequence length: 34871 Attention mask shape: torch.Size([1, 1, 34871, 34871]) Position ids shape: torch.Size([1, 34871]) Input IDs shape: torch.Size([1, 34871]) Labels shape: torch.Size([1, 34871]) Final batch size: 1, sequence length: 31513 Attention mask shape: torch.Size([1, 1, 31513, 31513]) Position ids shape: torch.Size([1, 31513]) Input IDs shape: torch.Size([1, 31513]) Labels shape: torch.Size([1, 31513]) Final batch size: 1, sequence length: 34170 Attention mask shape: torch.Size([1, 1, 34170, 34170]) Position ids shape: torch.Size([1, 34170]) Input IDs shape: torch.Size([1, 34170]) Labels shape: torch.Size([1, 34170]) Final batch size: 1, sequence length: 30639 Attention mask shape: torch.Size([1, 1, 30639, 30639]) Position ids shape: torch.Size([1, 30639]) Input IDs shape: torch.Size([1, 30639]) Labels shape: torch.Size([1, 30639]) Final batch size: 1, sequence length: 36535 Attention mask shape: torch.Size([1, 1, 36535, 36535]) Position ids shape: torch.Size([1, 36535]) Input IDs shape: torch.Size([1, 36535]) Labels shape: torch.Size([1, 36535]) Final batch size: 1, sequence length: 32327 Attention mask shape: torch.Size([1, 1, 32327, 32327]) Position ids shape: torch.Size([1, 32327]) Input IDs shape: torch.Size([1, 32327]) Labels shape: torch.Size([1, 32327]) Final batch size: 1, sequence length: 23243 Attention mask shape: torch.Size([1, 1, 23243, 23243]) Position ids shape: torch.Size([1, 23243]) Input IDs shape: torch.Size([1, 23243]) Labels shape: torch.Size([1, 23243]) Final batch size: 1, sequence length: 18106 Attention mask shape: torch.Size([1, 1, 18106, 18106]) Position ids shape: torch.Size([1, 18106]) Input IDs shape: torch.Size([1, 18106]) Labels shape: torch.Size([1, 18106]) Final batch size: 1, sequence length: 33807 Attention mask shape: torch.Size([1, 1, 33807, 33807]) Position ids shape: torch.Size([1, 33807]) Input IDs shape: torch.Size([1, 33807]) Labels shape: torch.Size([1, 33807]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 24840 Attention mask shape: torch.Size([1, 1, 24840, 24840]) Position ids shape: torch.Size([1, 24840]) Input IDs shape: torch.Size([1, 24840]) Labels shape: torch.Size([1, 24840]) Final batch size: 1, sequence length: 21825 Attention mask shape: torch.Size([1, 1, 21825, 21825]) Position ids shape: torch.Size([1, 21825]) Input IDs shape: torch.Size([1, 21825]) Labels shape: torch.Size([1, 21825]) Final batch size: 1, sequence length: 15697 Attention mask shape: torch.Size([1, 1, 15697, 15697]) Position ids shape: torch.Size([1, 15697]) Input IDs shape: torch.Size([1, 15697]) Labels shape: torch.Size([1, 15697]) Final batch size: 1, sequence length: 24910 Attention mask shape: torch.Size([1, 1, 24910, 24910]) Position ids shape: torch.Size([1, 24910]) Input IDs shape: torch.Size([1, 24910]) Labels shape: torch.Size([1, 24910]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 33883 Attention mask shape: torch.Size([1, 1, 33883, 33883]) Position ids shape: torch.Size([1, 33883]) Input IDs shape: torch.Size([1, 33883]) Labels shape: torch.Size([1, 33883]) Final batch size: 1, sequence length: 30361 Attention mask shape: torch.Size([1, 1, 30361, 30361]) Position ids shape: torch.Size([1, 30361]) Input IDs shape: torch.Size([1, 30361]) Labels shape: torch.Size([1, 30361]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 29691 Attention mask shape: torch.Size([1, 1, 29691, 29691]) Position ids shape: torch.Size([1, 29691]) Input IDs shape: torch.Size([1, 29691]) Labels shape: torch.Size([1, 29691]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 27228 Attention mask shape: torch.Size([1, 1, 27228, 27228]) Position ids shape: torch.Size([1, 27228]) Input IDs shape: torch.Size([1, 27228]) Labels shape: torch.Size([1, 27228]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 39483 Attention mask shape: torch.Size([1, 1, 39483, 39483]) Position ids shape: torch.Size([1, 39483]) Input IDs shape: torch.Size([1, 39483]) Labels shape: torch.Size([1, 39483]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 15180 Attention mask shape: torch.Size([1, 1, 15180, 15180]) Position ids shape: torch.Size([1, 15180]) Input IDs shape: torch.Size([1, 15180]) Labels shape: torch.Size([1, 15180]) Final batch size: 1, sequence length: 33549 Attention mask shape: torch.Size([1, 1, 33549, 33549]) Position ids shape: torch.Size([1, 33549]) Input IDs shape: torch.Size([1, 33549]) Labels shape: torch.Size([1, 33549]) Final batch size: 1, sequence length: 32351 Attention mask shape: torch.Size([1, 1, 32351, 32351]) Position ids shape: torch.Size([1, 32351]) Input IDs shape: torch.Size([1, 32351]) Labels shape: torch.Size([1, 32351]) Final batch size: 1, sequence length: 30419 Attention mask shape: torch.Size([1, 1, 30419, 30419]) Position ids shape: torch.Size([1, 30419]) Input IDs shape: torch.Size([1, 30419]) Labels shape: torch.Size([1, 30419]) Final batch size: 1, sequence length: 13064 Attention mask shape: torch.Size([1, 1, 13064, 13064]) Position ids shape: torch.Size([1, 13064]) Input IDs shape: torch.Size([1, 13064]) Labels shape: torch.Size([1, 13064]) Final batch size: 1, sequence length: 36532 Attention mask shape: torch.Size([1, 1, 36532, 36532]) Position ids shape: torch.Size([1, 36532]) Input IDs shape: torch.Size([1, 36532]) Labels shape: torch.Size([1, 36532]) {'loss': 0.2943, 'grad_norm': 0.3626885374478131, 'learning_rate': 8.146601955249187e-06, 'num_tokens': -inf, 'epoch': 2.75} Final batch size: 1, sequence length: 8845 Attention mask shape: torch.Size([1, 1, 8845, 8845]) Position ids shape: torch.Size([1, 8845]) Input IDs shape: torch.Size([1, 8845]) Labels shape: torch.Size([1, 8845]) Final batch size: 1, sequence length: 7235 Attention mask shape: torch.Size([1, 1, 7235, 7235]) Position ids shape: torch.Size([1, 7235]) Input IDs shape: torch.Size([1, 7235]) Labels shape: torch.Size([1, 7235]) Final batch size: 1, sequence length: 12215 Attention mask shape: torch.Size([1, 1, 12215, 12215]) Position ids shape: torch.Size([1, 12215]) Input IDs shape: torch.Size([1, 12215]) Labels shape: torch.Size([1, 12215]) Final batch size: 1, sequence length: 12830 Attention mask shape: torch.Size([1, 1, 12830, 12830]) Position ids shape: torch.Size([1, 12830]) Input IDs shape: torch.Size([1, 12830]) Labels shape: torch.Size([1, 12830]) Final batch size: 1, sequence length: 13575 Attention mask shape: torch.Size([1, 1, 13575, 13575]) Position ids shape: torch.Size([1, 13575]) Input IDs shape: torch.Size([1, 13575]) Labels shape: torch.Size([1, 13575]) Final batch size: 1, sequence length: 12562 Attention mask shape: torch.Size([1, 1, 12562, 12562]) Position ids shape: torch.Size([1, 12562]) Input IDs shape: torch.Size([1, 12562]) Labels shape: torch.Size([1, 12562]) Final batch size: 1, sequence length: 13665 Attention mask shape: torch.Size([1, 1, 13665, 13665]) Position ids shape: torch.Size([1, 13665]) Input IDs shape: torch.Size([1, 13665]) Labels shape: torch.Size([1, 13665]) Final batch size: 1, sequence length: 14689 Attention mask shape: torch.Size([1, 1, 14689, 14689]) Position ids shape: torch.Size([1, 14689]) Input IDs shape: torch.Size([1, 14689]) Labels shape: torch.Size([1, 14689]) Final batch size: 1, sequence length: 14833 Attention mask shape: torch.Size([1, 1, 14833, 14833]) Position ids shape: torch.Size([1, 14833]) Input IDs shape: torch.Size([1, 14833]) Labels shape: torch.Size([1, 14833]) Final batch size: 1, sequence length: 16145 Attention mask shape: torch.Size([1, 1, 16145, 16145]) Position ids shape: torch.Size([1, 16145]) Input IDs shape: torch.Size([1, 16145]) Labels shape: torch.Size([1, 16145]) Final batch size: 1, sequence length: 17026 Attention mask shape: torch.Size([1, 1, 17026, 17026]) Position ids shape: torch.Size([1, 17026]) Input IDs shape: torch.Size([1, 17026]) Labels shape: torch.Size([1, 17026]) Final batch size: 1, sequence length: 12960 Attention mask shape: torch.Size([1, 1, 12960, 12960]) Position ids shape: torch.Size([1, 12960]) Input IDs shape: torch.Size([1, 12960]) Labels shape: torch.Size([1, 12960]) Final batch size: 1, sequence length: 15673 Attention mask shape: torch.Size([1, 1, 15673, 15673]) Position ids shape: torch.Size([1, 15673]) Input IDs shape: torch.Size([1, 15673]) Labels shape: torch.Size([1, 15673]) Final batch size: 1, sequence length: 14482 Attention mask shape: torch.Size([1, 1, 14482, 14482]) Position ids shape: torch.Size([1, 14482]) Input IDs shape: torch.Size([1, 14482]) Labels shape: torch.Size([1, 14482]) Final batch size: 1, sequence length: 17512 Attention mask shape: torch.Size([1, 1, 17512, 17512]) Position ids shape: torch.Size([1, 17512]) Input IDs shape: torch.Size([1, 17512]) Labels shape: torch.Size([1, 17512]) Final batch size: 1, sequence length: 17016 Attention mask shape: torch.Size([1, 1, 17016, 17016]) Position ids shape: torch.Size([1, 17016]) Input IDs shape: torch.Size([1, 17016]) Labels shape: torch.Size([1, 17016]) Final batch size: 1, sequence length: 17988 Attention mask shape: torch.Size([1, 1, 17988, 17988]) Position ids shape: torch.Size([1, 17988]) Input IDs shape: torch.Size([1, 17988]) Labels shape: torch.Size([1, 17988]) Final batch size: 1, sequence length: 15816 Attention mask shape: torch.Size([1, 1, 15816, 15816]) Position ids shape: torch.Size([1, 15816]) Input IDs shape: torch.Size([1, 15816]) Labels shape: torch.Size([1, 15816]) Final batch size: 1, sequence length: 18922 Attention mask shape: torch.Size([1, 1, 18922, 18922]) Position ids shape: torch.Size([1, 18922]) Input IDs shape: torch.Size([1, 18922]) Labels shape: torch.Size([1, 18922]) Final batch size: 1, sequence length: 19409 Attention mask shape: torch.Size([1, 1, 19409, 19409]) Position ids shape: torch.Size([1, 19409]) Input IDs shape: torch.Size([1, 19409]) Labels shape: torch.Size([1, 19409]) Final batch size: 1, sequence length: 12622 Attention mask shape: torch.Size([1, 1, 12622, 12622]) Position ids shape: torch.Size([1, 12622]) Input IDs shape: torch.Size([1, 12622]) Labels shape: torch.Size([1, 12622]) Final batch size: 1, sequence length: 20630 Attention mask shape: torch.Size([1, 1, 20630, 20630]) Position ids shape: torch.Size([1, 20630]) Input IDs shape: torch.Size([1, 20630]) Labels shape: torch.Size([1, 20630]) Final batch size: 1, sequence length: 19724 Attention mask shape: torch.Size([1, 1, 19724, 19724]) Position ids shape: torch.Size([1, 19724]) Input IDs shape: torch.Size([1, 19724]) Labels shape: torch.Size([1, 19724]) Final batch size: 1, sequence length: 18978 Attention mask shape: torch.Size([1, 1, 18978, 18978]) Position ids shape: torch.Size([1, 18978]) Input IDs shape: torch.Size([1, 18978]) Labels shape: torch.Size([1, 18978]) Final batch size: 1, sequence length: 14577 Attention mask shape: torch.Size([1, 1, 14577, 14577]) Position ids shape: torch.Size([1, 14577]) Input IDs shape: torch.Size([1, 14577]) Labels shape: torch.Size([1, 14577]) Final batch size: 1, sequence length: 20389 Attention mask shape: torch.Size([1, 1, 20389, 20389]) Position ids shape: torch.Size([1, 20389]) Input IDs shape: torch.Size([1, 20389]) Labels shape: torch.Size([1, 20389]) Final batch size: 1, sequence length: 15222 Attention mask shape: torch.Size([1, 1, 15222, 15222]) Position ids shape: torch.Size([1, 15222]) Input IDs shape: torch.Size([1, 15222]) Labels shape: torch.Size([1, 15222]) Final batch size: 1, sequence length: 10494 Attention mask shape: torch.Size([1, 1, 10494, 10494]) Position ids shape: torch.Size([1, 10494]) Input IDs shape: torch.Size([1, 10494]) Labels shape: torch.Size([1, 10494]) Final batch size: 1, sequence length: 19225 Attention mask shape: torch.Size([1, 1, 19225, 19225]) Position ids shape: torch.Size([1, 19225]) Input IDs shape: torch.Size([1, 19225]) Labels shape: torch.Size([1, 19225]) Final batch size: 1, sequence length: 20991 Attention mask shape: torch.Size([1, 1, 20991, 20991]) Position ids shape: torch.Size([1, 20991]) Input IDs shape: torch.Size([1, 20991]) Labels shape: torch.Size([1, 20991]) Final batch size: 1, sequence length: 20784 Attention mask shape: torch.Size([1, 1, 20784, 20784]) Position ids shape: torch.Size([1, 20784]) Input IDs shape: torch.Size([1, 20784]) Labels shape: torch.Size([1, 20784]) Final batch size: 1, sequence length: 20585 Attention mask shape: torch.Size([1, 1, 20585, 20585]) Position ids shape: torch.Size([1, 20585]) Input IDs shape: torch.Size([1, 20585]) Labels shape: torch.Size([1, 20585]) Final batch size: 1, sequence length: 19325 Attention mask shape: torch.Size([1, 1, 19325, 19325]) Position ids shape: torch.Size([1, 19325]) Input IDs shape: torch.Size([1, 19325]) Labels shape: torch.Size([1, 19325]) Final batch size: 1, sequence length: 22949 Attention mask shape: torch.Size([1, 1, 22949, 22949]) Position ids shape: torch.Size([1, 22949]) Input IDs shape: torch.Size([1, 22949]) Labels shape: torch.Size([1, 22949]) Final batch size: 1, sequence length: 21598 Attention mask shape: torch.Size([1, 1, 21598, 21598]) Position ids shape: torch.Size([1, 21598]) Input IDs shape: torch.Size([1, 21598]) Labels shape: torch.Size([1, 21598]) Final batch size: 1, sequence length: 20465 Attention mask shape: torch.Size([1, 1, 20465, 20465]) Position ids shape: torch.Size([1, 20465]) Input IDs shape: torch.Size([1, 20465]) Labels shape: torch.Size([1, 20465]) Final batch size: 1, sequence length: 21075 Attention mask shape: torch.Size([1, 1, 21075, 21075]) Position ids shape: torch.Size([1, 21075]) Input IDs shape: torch.Size([1, 21075]) Labels shape: torch.Size([1, 21075]) Final batch size: 1, sequence length: 12969 Attention mask shape: torch.Size([1, 1, 12969, 12969]) Position ids shape: torch.Size([1, 12969]) Input IDs shape: torch.Size([1, 12969]) Labels shape: torch.Size([1, 12969]) Final batch size: 1, sequence length: 6618 Attention mask shape: torch.Size([1, 1, 6618, 6618]) Position ids shape: torch.Size([1, 6618]) Input IDs shape: torch.Size([1, 6618]) Labels shape: torch.Size([1, 6618]) Final batch size: 1, sequence length: 23293 Attention mask shape: torch.Size([1, 1, 23293, 23293]) Position ids shape: torch.Size([1, 23293]) Input IDs shape: torch.Size([1, 23293]) Labels shape: torch.Size([1, 23293]) Final batch size: 1, sequence length: 13072 Attention mask shape: torch.Size([1, 1, 13072, 13072]) Position ids shape: torch.Size([1, 13072]) Input IDs shape: torch.Size([1, 13072]) Labels shape: torch.Size([1, 13072]) Final batch size: 1, sequence length: 23805 Attention mask shape: torch.Size([1, 1, 23805, 23805]) Position ids shape: torch.Size([1, 23805]) Input IDs shape: torch.Size([1, 23805]) Labels shape: torch.Size([1, 23805]) Final batch size: 1, sequence length: 21907 Attention mask shape: torch.Size([1, 1, 21907, 21907]) Position ids shape: torch.Size([1, 21907]) Input IDs shape: torch.Size([1, 21907]) Labels shape: torch.Size([1, 21907]) Final batch size: 1, sequence length: 22778 Attention mask shape: torch.Size([1, 1, 22778, 22778]) Position ids shape: torch.Size([1, 22778]) Input IDs shape: torch.Size([1, 22778]) Labels shape: torch.Size([1, 22778]) Final batch size: 1, sequence length: 3010 Attention mask shape: torch.Size([1, 1, 3010, 3010]) Position ids shape: torch.Size([1, 3010]) Input IDs shape: torch.Size([1, 3010]) Labels shape: torch.Size([1, 3010]) Final batch size: 1, sequence length: 21322 Attention mask shape: torch.Size([1, 1, 21322, 21322]) Position ids shape: torch.Size([1, 21322]) Input IDs shape: torch.Size([1, 21322]) Labels shape: torch.Size([1, 21322]) Final batch size: 1, sequence length: 14548 Attention mask shape: torch.Size([1, 1, 14548, 14548]) Position ids shape: torch.Size([1, 14548]) Input IDs shape: torch.Size([1, 14548]) Labels shape: torch.Size([1, 14548]) Final batch size: 1, sequence length: 23437 Attention mask shape: torch.Size([1, 1, 23437, 23437]) Position ids shape: torch.Size([1, 23437]) Input IDs shape: torch.Size([1, 23437]) Labels shape: torch.Size([1, 23437]) Final batch size: 1, sequence length: 23755 Attention mask shape: torch.Size([1, 1, 23755, 23755]) Position ids shape: torch.Size([1, 23755]) Input IDs shape: torch.Size([1, 23755]) Labels shape: torch.Size([1, 23755]) Final batch size: 1, sequence length: 23442 Attention mask shape: torch.Size([1, 1, 23442, 23442]) Position ids shape: torch.Size([1, 23442]) Input IDs shape: torch.Size([1, 23442]) Labels shape: torch.Size([1, 23442]) Final batch size: 1, sequence length: 24107 Attention mask shape: torch.Size([1, 1, 24107, 24107]) Position ids shape: torch.Size([1, 24107]) Input IDs shape: torch.Size([1, 24107]) Labels shape: torch.Size([1, 24107]) Final batch size: 1, sequence length: 24111 Attention mask shape: torch.Size([1, 1, 24111, 24111]) Position ids shape: torch.Size([1, 24111]) Input IDs shape: torch.Size([1, 24111]) Labels shape: torch.Size([1, 24111]) Final batch size: 1, sequence length: 28474 Attention mask shape: torch.Size([1, 1, 28474, 28474]) Position ids shape: torch.Size([1, 28474]) Input IDs shape: torch.Size([1, 28474]) Labels shape: torch.Size([1, 28474]) Final batch size: 1, sequence length: 26424 Attention mask shape: torch.Size([1, 1, 26424, 26424]) Position ids shape: torch.Size([1, 26424]) Input IDs shape: torch.Size([1, 26424]) Labels shape: torch.Size([1, 26424]) Final batch size: 1, sequence length: 28012 Attention mask shape: torch.Size([1, 1, 28012, 28012]) Position ids shape: torch.Size([1, 28012]) Input IDs shape: torch.Size([1, 28012]) Labels shape: torch.Size([1, 28012]) Final batch size: 1, sequence length: 11662 Attention mask shape: torch.Size([1, 1, 11662, 11662]) Position ids shape: torch.Size([1, 11662]) Input IDs shape: torch.Size([1, 11662]) Labels shape: torch.Size([1, 11662]) Final batch size: 1, sequence length: 22851 Attention mask shape: torch.Size([1, 1, 22851, 22851]) Position ids shape: torch.Size([1, 22851]) Input IDs shape: torch.Size([1, 22851]) Labels shape: torch.Size([1, 22851]) Final batch size: 1, sequence length: 27520 Attention mask shape: torch.Size([1, 1, 27520, 27520]) Position ids shape: torch.Size([1, 27520]) Input IDs shape: torch.Size([1, 27520]) Labels shape: torch.Size([1, 27520]) Final batch size: 1, sequence length: 27384 Attention mask shape: torch.Size([1, 1, 27384, 27384]) Position ids shape: torch.Size([1, 27384]) Input IDs shape: torch.Size([1, 27384]) Labels shape: torch.Size([1, 27384]) Final batch size: 1, sequence length: 26500 Attention mask shape: torch.Size([1, 1, 26500, 26500]) Position ids shape: torch.Size([1, 26500]) Input IDs shape: torch.Size([1, 26500]) Labels shape: torch.Size([1, 26500]) Final batch size: 1, sequence length: 25934 Attention mask shape: torch.Size([1, 1, 25934, 25934]) Position ids shape: torch.Size([1, 25934]) Input IDs shape: torch.Size([1, 25934]) Labels shape: torch.Size([1, 25934]) Final batch size: 1, sequence length: 14137 Attention mask shape: torch.Size([1, 1, 14137, 14137]) Position ids shape: torch.Size([1, 14137]) Input IDs shape: torch.Size([1, 14137]) Labels shape: torch.Size([1, 14137]) Final batch size: 1, sequence length: 21853 Attention mask shape: torch.Size([1, 1, 21853, 21853]) Position ids shape: torch.Size([1, 21853]) Input IDs shape: torch.Size([1, 21853]) Labels shape: torch.Size([1, 21853]) Final batch size: 1, sequence length: 19027 Attention mask shape: torch.Size([1, 1, 19027, 19027]) Position ids shape: torch.Size([1, 19027]) Input IDs shape: torch.Size([1, 19027]) Labels shape: torch.Size([1, 19027]) Final batch size: 1, sequence length: 30235 Attention mask shape: torch.Size([1, 1, 30235, 30235]) Position ids shape: torch.Size([1, 30235]) Input IDs shape: torch.Size([1, 30235]) Labels shape: torch.Size([1, 30235]) Final batch size: 1, sequence length: 24716 Attention mask shape: torch.Size([1, 1, 24716, 24716]) Position ids shape: torch.Size([1, 24716]) Input IDs shape: torch.Size([1, 24716]) Labels shape: torch.Size([1, 24716]) Final batch size: 1, sequence length: 20028 Attention mask shape: torch.Size([1, 1, 20028, 20028]) Position ids shape: torch.Size([1, 20028]) Input IDs shape: torch.Size([1, 20028]) Labels shape: torch.Size([1, 20028]) Final batch size: 1, sequence length: 24364 Attention mask shape: torch.Size([1, 1, 24364, 24364]) Position ids shape: torch.Size([1, 24364]) Input IDs shape: torch.Size([1, 24364]) Labels shape: torch.Size([1, 24364]) Final batch size: 1, sequence length: 20314 Attention mask shape: torch.Size([1, 1, 20314, 20314]) Position ids shape: torch.Size([1, 20314]) Input IDs shape: torch.Size([1, 20314]) Labels shape: torch.Size([1, 20314]) Final batch size: 1, sequence length: 12234 Attention mask shape: torch.Size([1, 1, 12234, 12234]) Position ids shape: torch.Size([1, 12234]) Input IDs shape: torch.Size([1, 12234]) Labels shape: torch.Size([1, 12234]) Final batch size: 1, sequence length: 29842 Attention mask shape: torch.Size([1, 1, 29842, 29842]) Position ids shape: torch.Size([1, 29842]) Input IDs shape: torch.Size([1, 29842]) Labels shape: torch.Size([1, 29842]) Final batch size: 1, sequence length: 26454 Attention mask shape: torch.Size([1, 1, 26454, 26454]) Position ids shape: torch.Size([1, 26454]) Input IDs shape: torch.Size([1, 26454]) Labels shape: torch.Size([1, 26454]) Final batch size: 1, sequence length: 31404 Attention mask shape: torch.Size([1, 1, 31404, 31404]) Position ids shape: torch.Size([1, 31404]) Input IDs shape: torch.Size([1, 31404]) Labels shape: torch.Size([1, 31404]) Final batch size: 1, sequence length: 34169 Attention mask shape: torch.Size([1, 1, 34169, 34169]) Position ids shape: torch.Size([1, 34169]) Input IDs shape: torch.Size([1, 34169]) Labels shape: torch.Size([1, 34169]) Final batch size: 1, sequence length: 23855 Attention mask shape: torch.Size([1, 1, 23855, 23855]) Position ids shape: torch.Size([1, 23855]) Input IDs shape: torch.Size([1, 23855]) Labels shape: torch.Size([1, 23855]) Final batch size: 1, sequence length: 30041 Attention mask shape: torch.Size([1, 1, 30041, 30041]) Position ids shape: torch.Size([1, 30041]) Input IDs shape: torch.Size([1, 30041]) Labels shape: torch.Size([1, 30041]) Final batch size: 1, sequence length: 33765 Attention mask shape: torch.Size([1, 1, 33765, 33765]) Position ids shape: torch.Size([1, 33765]) Input IDs shape: torch.Size([1, 33765]) Labels shape: torch.Size([1, 33765]) Final batch size: 1, sequence length: 12519 Attention mask shape: torch.Size([1, 1, 12519, 12519]) Position ids shape: torch.Size([1, 12519]) Input IDs shape: torch.Size([1, 12519]) Labels shape: torch.Size([1, 12519]) Final batch size: 1, sequence length: 24830 Attention mask shape: torch.Size([1, 1, 24830, 24830]) Position ids shape: torch.Size([1, 24830]) Input IDs shape: torch.Size([1, 24830]) Labels shape: torch.Size([1, 24830]) Final batch size: 1, sequence length: 21168 Attention mask shape: torch.Size([1, 1, 21168, 21168]) Position ids shape: torch.Size([1, 21168]) Input IDs shape: torch.Size([1, 21168]) Labels shape: torch.Size([1, 21168]) Final batch size: 1, sequence length: 27107 Attention mask shape: torch.Size([1, 1, 27107, 27107]) Position ids shape: torch.Size([1, 27107]) Input IDs shape: torch.Size([1, 27107]) Labels shape: torch.Size([1, 27107]) Final batch size: 1, sequence length: 23695 Attention mask shape: torch.Size([1, 1, 23695, 23695]) Position ids shape: torch.Size([1, 23695]) Input IDs shape: torch.Size([1, 23695]) Labels shape: torch.Size([1, 23695]) Final batch size: 1, sequence length: 29385 Attention mask shape: torch.Size([1, 1, 29385, 29385]) Position ids shape: torch.Size([1, 29385]) Input IDs shape: torch.Size([1, 29385]) Labels shape: torch.Size([1, 29385]) Final batch size: 1, sequence length: 33179 Attention mask shape: torch.Size([1, 1, 33179, 33179]) Position ids shape: torch.Size([1, 33179]) Input IDs shape: torch.Size([1, 33179]) Labels shape: torch.Size([1, 33179]) Final batch size: 1, sequence length: 22920 Attention mask shape: torch.Size([1, 1, 22920, 22920]) Position ids shape: torch.Size([1, 22920]) Input IDs shape: torch.Size([1, 22920]) Labels shape: torch.Size([1, 22920]) Final batch size: 1, sequence length: 19302 Attention mask shape: torch.Size([1, 1, 19302, 19302]) Position ids shape: torch.Size([1, 19302]) Input IDs shape: torch.Size([1, 19302]) Labels shape: torch.Size([1, 19302]) Final batch size: 1, sequence length: 17763 Attention mask shape: torch.Size([1, 1, 17763, 17763]) Position ids shape: torch.Size([1, 17763]) Input IDs shape: torch.Size([1, 17763]) Labels shape: torch.Size([1, 17763]) Final batch size: 1, sequence length: 29743 Attention mask shape: torch.Size([1, 1, 29743, 29743]) Position ids shape: torch.Size([1, 29743]) Input IDs shape: torch.Size([1, 29743]) Labels shape: torch.Size([1, 29743]) Final batch size: 1, sequence length: 38576 Attention mask shape: torch.Size([1, 1, 38576, 38576]) Position ids shape: torch.Size([1, 38576]) Input IDs shape: torch.Size([1, 38576]) Labels shape: torch.Size([1, 38576]) Final batch size: 1, sequence length: 24187 Attention mask shape: torch.Size([1, 1, 24187, 24187]) Position ids shape: torch.Size([1, 24187]) Input IDs shape: torch.Size([1, 24187]) Labels shape: torch.Size([1, 24187]) Final batch size: 1, sequence length: 10139 Attention mask shape: torch.Size([1, 1, 10139, 10139]) Position ids shape: torch.Size([1, 10139]) Input IDs shape: torch.Size([1, 10139]) Labels shape: torch.Size([1, 10139]) Final batch size: 1, sequence length: 33321 Attention mask shape: torch.Size([1, 1, 33321, 33321]) Position ids shape: torch.Size([1, 33321]) Input IDs shape: torch.Size([1, 33321]) Labels shape: torch.Size([1, 33321]) Final batch size: 1, sequence length: 27738 Attention mask shape: torch.Size([1, 1, 27738, 27738]) Position ids shape: torch.Size([1, 27738]) Input IDs shape: torch.Size([1, 27738]) Labels shape: torch.Size([1, 27738]) Final batch size: 1, sequence length: 37657 Attention mask shape: torch.Size([1, 1, 37657, 37657]) Position ids shape: torch.Size([1, 37657]) Input IDs shape: torch.Size([1, 37657]) Labels shape: torch.Size([1, 37657]) Final batch size: 1, sequence length: 24923 Attention mask shape: torch.Size([1, 1, 24923, 24923]) Position ids shape: torch.Size([1, 24923]) Input IDs shape: torch.Size([1, 24923]) Labels shape: torch.Size([1, 24923]) Final batch size: 1, sequence length: 40006 Attention mask shape: torch.Size([1, 1, 40006, 40006]) Position ids shape: torch.Size([1, 40006]) Input IDs shape: torch.Size([1, 40006]) Labels shape: torch.Size([1, 40006]) Final batch size: 1, sequence length: 26696 Attention mask shape: torch.Size([1, 1, 26696, 26696]) Position ids shape: torch.Size([1, 26696]) Input IDs shape: torch.Size([1, 26696]) Labels shape: torch.Size([1, 26696]) Final batch size: 1, sequence length: 26685 Attention mask shape: torch.Size([1, 1, 26685, 26685]) Position ids shape: torch.Size([1, 26685]) Input IDs shape: torch.Size([1, 26685]) Labels shape: torch.Size([1, 26685]) Final batch size: 1, sequence length: 35555 Attention mask shape: torch.Size([1, 1, 35555, 35555]) Position ids shape: torch.Size([1, 35555]) Input IDs shape: torch.Size([1, 35555]) Labels shape: torch.Size([1, 35555]) Final batch size: 1, sequence length: 32217 Attention mask shape: torch.Size([1, 1, 32217, 32217]) Position ids shape: torch.Size([1, 32217]) Input IDs shape: torch.Size([1, 32217]) Labels shape: torch.Size([1, 32217]) Final batch size: 1, sequence length: 18734 Attention mask shape: torch.Size([1, 1, 18734, 18734]) Position ids shape: torch.Size([1, 18734]) Input IDs shape: torch.Size([1, 18734]) Labels shape: torch.Size([1, 18734]) Final batch size: 1, sequence length: 35949 Attention mask shape: torch.Size([1, 1, 35949, 35949]) Position ids shape: torch.Size([1, 35949]) Input IDs shape: torch.Size([1, 35949]) Labels shape: torch.Size([1, 35949]) Final batch size: 1, sequence length: 29085 Attention mask shape: torch.Size([1, 1, 29085, 29085]) Position ids shape: torch.Size([1, 29085]) Input IDs shape: torch.Size([1, 29085]) Labels shape: torch.Size([1, 29085]) Final batch size: 1, sequence length: 29286 Attention mask shape: torch.Size([1, 1, 29286, 29286]) Position ids shape: torch.Size([1, 29286]) Input IDs shape: torch.Size([1, 29286]) Labels shape: torch.Size([1, 29286]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 39018 Attention mask shape: torch.Size([1, 1, 39018, 39018]) Position ids shape: torch.Size([1, 39018]) Input IDs shape: torch.Size([1, 39018]) Labels shape: torch.Size([1, 39018]) Final batch size: 1, sequence length: 23848 Attention mask shape: torch.Size([1, 1, 23848, 23848]) Position ids shape: torch.Size([1, 23848]) Input IDs shape: torch.Size([1, 23848]) Labels shape: torch.Size([1, 23848]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 32762 Attention mask shape: torch.Size([1, 1, 32762, 32762]) Position ids shape: torch.Size([1, 32762]) Input IDs shape: torch.Size([1, 32762]) Labels shape: torch.Size([1, 32762]) Final batch size: 1, sequence length: 31991 Attention mask shape: torch.Size([1, 1, 31991, 31991]) Position ids shape: torch.Size([1, 31991]) Input IDs shape: torch.Size([1, 31991]) Labels shape: torch.Size([1, 31991]) Final batch size: 1, sequence length: 31115 Attention mask shape: torch.Size([1, 1, 31115, 31115]) Position ids shape: torch.Size([1, 31115]) Input IDs shape: torch.Size([1, 31115]) Labels shape: torch.Size([1, 31115]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 22500 Attention mask shape: torch.Size([1, 1, 22500, 22500]) Position ids shape: torch.Size([1, 22500]) Input IDs shape: torch.Size([1, 22500]) Labels shape: torch.Size([1, 22500]) Final batch size: 1, sequence length: 19138 Attention mask shape: torch.Size([1, 1, 19138, 19138]) Position ids shape: torch.Size([1, 19138]) Input IDs shape: torch.Size([1, 19138]) Labels shape: torch.Size([1, 19138]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 25990 Attention mask shape: torch.Size([1, 1, 25990, 25990]) Position ids shape: torch.Size([1, 25990]) Input IDs shape: torch.Size([1, 25990]) Labels shape: torch.Size([1, 25990]) Final batch size: 1, sequence length: 34628 Attention mask shape: torch.Size([1, 1, 34628, 34628]) Position ids shape: torch.Size([1, 34628]) Input IDs shape: torch.Size([1, 34628]) Labels shape: torch.Size([1, 34628]) Final batch size: 1, sequence length: 40834 Attention mask shape: torch.Size([1, 1, 40834, 40834]) Position ids shape: torch.Size([1, 40834]) Input IDs shape: torch.Size([1, 40834]) Labels shape: torch.Size([1, 40834]) Final batch size: 1, sequence length: 22778 Attention mask shape: torch.Size([1, 1, 22778, 22778]) Position ids shape: torch.Size([1, 22778]) Input IDs shape: torch.Size([1, 22778]) Labels shape: torch.Size([1, 22778]) Final batch size: 1, sequence length: 32675 Attention mask shape: torch.Size([1, 1, 32675, 32675]) Position ids shape: torch.Size([1, 32675]) Input IDs shape: torch.Size([1, 32675]) Labels shape: torch.Size([1, 32675]) Final batch size: 1, sequence length: 26372 Attention mask shape: torch.Size([1, 1, 26372, 26372]) Position ids shape: torch.Size([1, 26372]) Input IDs shape: torch.Size([1, 26372]) Labels shape: torch.Size([1, 26372]) Final batch size: 1, sequence length: 37680 Attention mask shape: torch.Size([1, 1, 37680, 37680]) Position ids shape: torch.Size([1, 37680]) Input IDs shape: torch.Size([1, 37680]) Labels shape: torch.Size([1, 37680]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 34791 Attention mask shape: torch.Size([1, 1, 34791, 34791]) Position ids shape: torch.Size([1, 34791]) Input IDs shape: torch.Size([1, 34791]) Labels shape: torch.Size([1, 34791]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) {'loss': 0.2866, 'grad_norm': 0.33088050444946376, 'learning_rate': 7.938926261462366e-06, 'num_tokens': -inf, 'epoch': 2.88} Final batch size: 1, sequence length: 5686 Attention mask shape: torch.Size([1, 1, 5686, 5686]) Position ids shape: torch.Size([1, 5686]) Input IDs shape: torch.Size([1, 5686]) Labels shape: torch.Size([1, 5686]) Final batch size: 1, sequence length: 6986 Attention mask shape: torch.Size([1, 1, 6986, 6986]) Position ids shape: torch.Size([1, 6986]) Input IDs shape: torch.Size([1, 6986]) Labels shape: torch.Size([1, 6986]) Final batch size: 1, sequence length: 11304 Attention mask shape: torch.Size([1, 1, 11304, 11304]) Position ids shape: torch.Size([1, 11304]) Input IDs shape: torch.Size([1, 11304]) Labels shape: torch.Size([1, 11304]) Final batch size: 1, sequence length: 12847 Attention mask shape: torch.Size([1, 1, 12847, 12847]) Position ids shape: torch.Size([1, 12847]) Input IDs shape: torch.Size([1, 12847]) Labels shape: torch.Size([1, 12847]) Final batch size: 1, sequence length: 13224 Attention mask shape: torch.Size([1, 1, 13224, 13224]) Position ids shape: torch.Size([1, 13224]) Input IDs shape: torch.Size([1, 13224]) Labels shape: torch.Size([1, 13224]) Final batch size: 1, sequence length: 14168 Attention mask shape: torch.Size([1, 1, 14168, 14168]) Position ids shape: torch.Size([1, 14168]) Input IDs shape: torch.Size([1, 14168]) Labels shape: torch.Size([1, 14168]) Final batch size: 1, sequence length: 14304 Attention mask shape: torch.Size([1, 1, 14304, 14304]) Position ids shape: torch.Size([1, 14304]) Input IDs shape: torch.Size([1, 14304]) Labels shape: torch.Size([1, 14304]) Final batch size: 1, sequence length: 15621 Attention mask shape: torch.Size([1, 1, 15621, 15621]) Position ids shape: torch.Size([1, 15621]) Input IDs shape: torch.Size([1, 15621]) Labels shape: torch.Size([1, 15621]) Final batch size: 1, sequence length: 16440 Attention mask shape: torch.Size([1, 1, 16440, 16440]) Position ids shape: torch.Size([1, 16440]) Input IDs shape: torch.Size([1, 16440]) Labels shape: torch.Size([1, 16440]) Final batch size: 1, sequence length: 15652 Attention mask shape: torch.Size([1, 1, 15652, 15652]) Position ids shape: torch.Size([1, 15652]) Input IDs shape: torch.Size([1, 15652]) Labels shape: torch.Size([1, 15652]) Final batch size: 1, sequence length: 13872 Attention mask shape: torch.Size([1, 1, 13872, 13872]) Position ids shape: torch.Size([1, 13872]) Input IDs shape: torch.Size([1, 13872]) Labels shape: torch.Size([1, 13872]) Final batch size: 1, sequence length: 18246 Attention mask shape: torch.Size([1, 1, 18246, 18246]) Position ids shape: torch.Size([1, 18246]) Input IDs shape: torch.Size([1, 18246]) Labels shape: torch.Size([1, 18246]) Final batch size: 1, sequence length: 17773 Attention mask shape: torch.Size([1, 1, 17773, 17773]) Position ids shape: torch.Size([1, 17773]) Input IDs shape: torch.Size([1, 17773]) Labels shape: torch.Size([1, 17773]) Final batch size: 1, sequence length: 17213 Attention mask shape: torch.Size([1, 1, 17213, 17213]) Position ids shape: torch.Size([1, 17213]) Input IDs shape: torch.Size([1, 17213]) Labels shape: torch.Size([1, 17213]) Final batch size: 1, sequence length: 16378 Attention mask shape: torch.Size([1, 1, 16378, 16378]) Position ids shape: torch.Size([1, 16378]) Input IDs shape: torch.Size([1, 16378]) Labels shape: torch.Size([1, 16378]) Final batch size: 1, sequence length: 16514 Attention mask shape: torch.Size([1, 1, 16514, 16514]) Position ids shape: torch.Size([1, 16514]) Input IDs shape: torch.Size([1, 16514]) Labels shape: torch.Size([1, 16514]) Final batch size: 1, sequence length: 21163 Attention mask shape: torch.Size([1, 1, 21163, 21163]) Position ids shape: torch.Size([1, 21163]) Input IDs shape: torch.Size([1, 21163]) Labels shape: torch.Size([1, 21163]) Final batch size: 1, sequence length: 18044 Attention mask shape: torch.Size([1, 1, 18044, 18044]) Position ids shape: torch.Size([1, 18044]) Input IDs shape: torch.Size([1, 18044]) Labels shape: torch.Size([1, 18044]) Final batch size: 1, sequence length: 18155 Attention mask shape: torch.Size([1, 1, 18155, 18155]) Position ids shape: torch.Size([1, 18155]) Input IDs shape: torch.Size([1, 18155]) Labels shape: torch.Size([1, 18155]) Final batch size: 1, sequence length: 18289 Attention mask shape: torch.Size([1, 1, 18289, 18289]) Position ids shape: torch.Size([1, 18289]) Input IDs shape: torch.Size([1, 18289]) Labels shape: torch.Size([1, 18289]) Final batch size: 1, sequence length: 17852 Attention mask shape: torch.Size([1, 1, 17852, 17852]) Position ids shape: torch.Size([1, 17852]) Input IDs shape: torch.Size([1, 17852]) Labels shape: torch.Size([1, 17852]) Final batch size: 1, sequence length: 16501 Attention mask shape: torch.Size([1, 1, 16501, 16501]) Position ids shape: torch.Size([1, 16501]) Input IDs shape: torch.Size([1, 16501]) Labels shape: torch.Size([1, 16501]) Final batch size: 1, sequence length: 21562 Attention mask shape: torch.Size([1, 1, 21562, 21562]) Position ids shape: torch.Size([1, 21562]) Input IDs shape: torch.Size([1, 21562]) Labels shape: torch.Size([1, 21562]) Final batch size: 1, sequence length: 22784 Attention mask shape: torch.Size([1, 1, 22784, 22784]) Position ids shape: torch.Size([1, 22784]) Input IDs shape: torch.Size([1, 22784]) Labels shape: torch.Size([1, 22784]) Final batch size: 1, sequence length: 20813 Attention mask shape: torch.Size([1, 1, 20813, 20813]) Position ids shape: torch.Size([1, 20813]) Input IDs shape: torch.Size([1, 20813]) Labels shape: torch.Size([1, 20813]) Final batch size: 1, sequence length: 20766 Attention mask shape: torch.Size([1, 1, 20766, 20766]) Position ids shape: torch.Size([1, 20766]) Input IDs shape: torch.Size([1, 20766]) Labels shape: torch.Size([1, 20766]) Final batch size: 1, sequence length: 22197 Attention mask shape: torch.Size([1, 1, 22197, 22197]) Position ids shape: torch.Size([1, 22197]) Input IDs shape: torch.Size([1, 22197]) Labels shape: torch.Size([1, 22197]) Final batch size: 1, sequence length: 21207 Attention mask shape: torch.Size([1, 1, 21207, 21207]) Position ids shape: torch.Size([1, 21207]) Input IDs shape: torch.Size([1, 21207]) Labels shape: torch.Size([1, 21207]) Final batch size: 1, sequence length: 22117 Attention mask shape: torch.Size([1, 1, 22117, 22117]) Position ids shape: torch.Size([1, 22117]) Input IDs shape: torch.Size([1, 22117]) Labels shape: torch.Size([1, 22117]) Final batch size: 1, sequence length: 20176 Attention mask shape: torch.Size([1, 1, 20176, 20176]) Position ids shape: torch.Size([1, 20176]) Input IDs shape: torch.Size([1, 20176]) Labels shape: torch.Size([1, 20176]) Final batch size: 1, sequence length: 21705 Attention mask shape: torch.Size([1, 1, 21705, 21705]) Position ids shape: torch.Size([1, 21705]) Input IDs shape: torch.Size([1, 21705]) Labels shape: torch.Size([1, 21705]) Final batch size: 1, sequence length: 21316 Attention mask shape: torch.Size([1, 1, 21316, 21316]) Position ids shape: torch.Size([1, 21316]) Input IDs shape: torch.Size([1, 21316]) Labels shape: torch.Size([1, 21316]) Final batch size: 1, sequence length: 25880 Attention mask shape: torch.Size([1, 1, 25880, 25880]) Position ids shape: torch.Size([1, 25880]) Input IDs shape: torch.Size([1, 25880]) Labels shape: torch.Size([1, 25880]) Final batch size: 1, sequence length: 21607 Attention mask shape: torch.Size([1, 1, 21607, 21607]) Position ids shape: torch.Size([1, 21607]) Input IDs shape: torch.Size([1, 21607]) Labels shape: torch.Size([1, 21607]) Final batch size: 1, sequence length: 22152 Attention mask shape: torch.Size([1, 1, 22152, 22152]) Position ids shape: torch.Size([1, 22152]) Input IDs shape: torch.Size([1, 22152]) Labels shape: torch.Size([1, 22152]) Final batch size: 1, sequence length: 25053 Attention mask shape: torch.Size([1, 1, 25053, 25053]) Position ids shape: torch.Size([1, 25053]) Input IDs shape: torch.Size([1, 25053]) Labels shape: torch.Size([1, 25053]) Final batch size: 1, sequence length: 27005 Attention mask shape: torch.Size([1, 1, 27005, 27005]) Position ids shape: torch.Size([1, 27005]) Input IDs shape: torch.Size([1, 27005]) Labels shape: torch.Size([1, 27005]) Final batch size: 1, sequence length: 26494 Attention mask shape: torch.Size([1, 1, 26494, 26494]) Position ids shape: torch.Size([1, 26494]) Input IDs shape: torch.Size([1, 26494]) Labels shape: torch.Size([1, 26494]) Final batch size: 1, sequence length: 26921 Attention mask shape: torch.Size([1, 1, 26921, 26921]) Position ids shape: torch.Size([1, 26921]) Input IDs shape: torch.Size([1, 26921]) Labels shape: torch.Size([1, 26921]) Final batch size: 1, sequence length: 26303 Attention mask shape: torch.Size([1, 1, 26303, 26303]) Position ids shape: torch.Size([1, 26303]) Input IDs shape: torch.Size([1, 26303]) Labels shape: torch.Size([1, 26303]) Final batch size: 1, sequence length: 30592 Attention mask shape: torch.Size([1, 1, 30592, 30592]) Position ids shape: torch.Size([1, 30592]) Input IDs shape: torch.Size([1, 30592]) Labels shape: torch.Size([1, 30592]) Final batch size: 1, sequence length: 30151 Attention mask shape: torch.Size([1, 1, 30151, 30151]) Position ids shape: torch.Size([1, 30151]) Input IDs shape: torch.Size([1, 30151]) Labels shape: torch.Size([1, 30151]) Final batch size: 1, sequence length: 28523 Attention mask shape: torch.Size([1, 1, 28523, 28523]) Position ids shape: torch.Size([1, 28523]) Input IDs shape: torch.Size([1, 28523]) Labels shape: torch.Size([1, 28523]) Final batch size: 1, sequence length: 32473 Attention mask shape: torch.Size([1, 1, 32473, 32473]) Position ids shape: torch.Size([1, 32473]) Input IDs shape: torch.Size([1, 32473]) Labels shape: torch.Size([1, 32473]) Final batch size: 1, sequence length: 28897 Attention mask shape: torch.Size([1, 1, 28897, 28897]) Position ids shape: torch.Size([1, 28897]) Input IDs shape: torch.Size([1, 28897]) Labels shape: torch.Size([1, 28897]) Final batch size: 1, sequence length: 33482 Attention mask shape: torch.Size([1, 1, 33482, 33482]) Position ids shape: torch.Size([1, 33482]) Input IDs shape: torch.Size([1, 33482]) Labels shape: torch.Size([1, 33482]) Final batch size: 1, sequence length: 32596 Attention mask shape: torch.Size([1, 1, 32596, 32596]) Position ids shape: torch.Size([1, 32596]) Input IDs shape: torch.Size([1, 32596]) Labels shape: torch.Size([1, 32596]) Final batch size: 1, sequence length: 35100 Attention mask shape: torch.Size([1, 1, 35100, 35100]) Position ids shape: torch.Size([1, 35100]) Input IDs shape: torch.Size([1, 35100]) Labels shape: torch.Size([1, 35100]) Final batch size: 1, sequence length: 35523 Attention mask shape: torch.Size([1, 1, 35523, 35523]) Position ids shape: torch.Size([1, 35523]) Input IDs shape: torch.Size([1, 35523]) Labels shape: torch.Size([1, 35523]) Final batch size: 1, sequence length: 35866 Attention mask shape: torch.Size([1, 1, 35866, 35866]) Position ids shape: torch.Size([1, 35866]) Input IDs shape: torch.Size([1, 35866]) Labels shape: torch.Size([1, 35866]) Final batch size: 1, sequence length: 35485 Attention mask shape: torch.Size([1, 1, 35485, 35485]) Position ids shape: torch.Size([1, 35485]) Input IDs shape: torch.Size([1, 35485]) Labels shape: torch.Size([1, 35485]) Final batch size: 1, sequence length: 35030 Attention mask shape: torch.Size([1, 1, 35030, 35030]) Position ids shape: torch.Size([1, 35030]) Input IDs shape: torch.Size([1, 35030]) Labels shape: torch.Size([1, 35030]) Final batch size: 1, sequence length: 34469 Attention mask shape: torch.Size([1, 1, 34469, 34469]) Position ids shape: torch.Size([1, 34469]) Input IDs shape: torch.Size([1, 34469]) Labels shape: torch.Size([1, 34469]) Final batch size: 1, sequence length: 39589 Attention mask shape: torch.Size([1, 1, 39589, 39589]) Position ids shape: torch.Size([1, 39589]) Input IDs shape: torch.Size([1, 39589]) Labels shape: torch.Size([1, 39589]) Final batch size: 1, sequence length: 40526 Attention mask shape: torch.Size([1, 1, 40526, 40526]) Position ids shape: torch.Size([1, 40526]) Input IDs shape: torch.Size([1, 40526]) Labels shape: torch.Size([1, 40526]) Final batch size: 1, sequence length: 37980 Attention mask shape: torch.Size([1, 1, 37980, 37980]) Position ids shape: torch.Size([1, 37980]) Input IDs shape: torch.Size([1, 37980]) Labels shape: torch.Size([1, 37980]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 37040 Attention mask shape: torch.Size([1, 1, 37040, 37040]) Position ids shape: torch.Size([1, 37040]) Input IDs shape: torch.Size([1, 37040]) Labels shape: torch.Size([1, 37040]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 38927 Attention mask shape: torch.Size([1, 1, 38927, 38927]) Position ids shape: torch.Size([1, 38927]) Input IDs shape: torch.Size([1, 38927]) Labels shape: torch.Size([1, 38927]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) {'loss': 0.2812, 'grad_norm': 0.33603851764140297, 'learning_rate': 7.723195175075136e-06, 'num_tokens': -inf, 'epoch': 3.0} Final batch size: 1, sequence length: 5686 Attention mask shape: torch.Size([1, 1, 5686, 5686]) Position ids shape: torch.Size([1, 5686]) Input IDs shape: torch.Size([1, 5686]) Labels shape: torch.Size([1, 5686]) Final batch size: 1, sequence length: 6986 Attention mask shape: torch.Size([1, 1, 6986, 6986]) Position ids shape: torch.Size([1, 6986]) Input IDs shape: torch.Size([1, 6986]) Labels shape: torch.Size([1, 6986]) Final batch size: 1, sequence length: 6362 Attention mask shape: torch.Size([1, 1, 6362, 6362]) Position ids shape: torch.Size([1, 6362]) Input IDs shape: torch.Size([1, 6362]) Labels shape: torch.Size([1, 6362]) Final batch size: 1, sequence length: 10523 Attention mask shape: torch.Size([1, 1, 10523, 10523]) Position ids shape: torch.Size([1, 10523]) Input IDs shape: torch.Size([1, 10523]) Labels shape: torch.Size([1, 10523]) Final batch size: 1, sequence length: 11304 Attention mask shape: torch.Size([1, 1, 11304, 11304]) Position ids shape: torch.Size([1, 11304]) Input IDs shape: torch.Size([1, 11304]) Labels shape: torch.Size([1, 11304]) Final batch size: 1, sequence length: 13224 Attention mask shape: torch.Size([1, 1, 13224, 13224]) Position ids shape: torch.Size([1, 13224]) Input IDs shape: torch.Size([1, 13224]) Labels shape: torch.Size([1, 13224]) Final batch size: 1, sequence length: 12847 Attention mask shape: torch.Size([1, 1, 12847, 12847]) Position ids shape: torch.Size([1, 12847]) Input IDs shape: torch.Size([1, 12847]) Labels shape: torch.Size([1, 12847]) Final batch size: 1, sequence length: 14304 Attention mask shape: torch.Size([1, 1, 14304, 14304]) Position ids shape: torch.Size([1, 14304]) Input IDs shape: torch.Size([1, 14304]) Labels shape: torch.Size([1, 14304]) Final batch size: 1, sequence length: 14168 Attention mask shape: torch.Size([1, 1, 14168, 14168]) Position ids shape: torch.Size([1, 14168]) Input IDs shape: torch.Size([1, 14168]) Labels shape: torch.Size([1, 14168]) Final batch size: 1, sequence length: 13872 Attention mask shape: torch.Size([1, 1, 13872, 13872]) Position ids shape: torch.Size([1, 13872]) Input IDs shape: torch.Size([1, 13872]) Labels shape: torch.Size([1, 13872]) Final batch size: 1, sequence length: 15652 Attention mask shape: torch.Size([1, 1, 15652, 15652]) Position ids shape: torch.Size([1, 15652]) Input IDs shape: torch.Size([1, 15652]) Labels shape: torch.Size([1, 15652]) Final batch size: 1, sequence length: 15621 Attention mask shape: torch.Size([1, 1, 15621, 15621]) Position ids shape: torch.Size([1, 15621]) Input IDs shape: torch.Size([1, 15621]) Labels shape: torch.Size([1, 15621]) Final batch size: 1, sequence length: 16378 Attention mask shape: torch.Size([1, 1, 16378, 16378]) Position ids shape: torch.Size([1, 16378]) Input IDs shape: torch.Size([1, 16378]) Labels shape: torch.Size([1, 16378]) Final batch size: 1, sequence length: 16501 Attention mask shape: torch.Size([1, 1, 16501, 16501]) Position ids shape: torch.Size([1, 16501]) Input IDs shape: torch.Size([1, 16501]) Labels shape: torch.Size([1, 16501]) Final batch size: 1, sequence length: 18044 Attention mask shape: torch.Size([1, 1, 18044, 18044]) Position ids shape: torch.Size([1, 18044]) Input IDs shape: torch.Size([1, 18044]) Labels shape: torch.Size([1, 18044]) Final batch size: 1, sequence length: 16514 Attention mask shape: torch.Size([1, 1, 16514, 16514]) Position ids shape: torch.Size([1, 16514]) Input IDs shape: torch.Size([1, 16514]) Labels shape: torch.Size([1, 16514]) Final batch size: 1, sequence length: 18246 Attention mask shape: torch.Size([1, 1, 18246, 18246]) Position ids shape: torch.Size([1, 18246]) Input IDs shape: torch.Size([1, 18246]) Labels shape: torch.Size([1, 18246]) Final batch size: 1, sequence length: 18289 Attention mask shape: torch.Size([1, 1, 18289, 18289]) Position ids shape: torch.Size([1, 18289]) Input IDs shape: torch.Size([1, 18289]) Labels shape: torch.Size([1, 18289]) Final batch size: 1, sequence length: 16440 Attention mask shape: torch.Size([1, 1, 16440, 16440]) Position ids shape: torch.Size([1, 16440]) Input IDs shape: torch.Size([1, 16440]) Labels shape: torch.Size([1, 16440]) Final batch size: 1, sequence length: 17213 Attention mask shape: torch.Size([1, 1, 17213, 17213]) Position ids shape: torch.Size([1, 17213]) Input IDs shape: torch.Size([1, 17213]) Labels shape: torch.Size([1, 17213]) Final batch size: 1, sequence length: 18155 Attention mask shape: torch.Size([1, 1, 18155, 18155]) Position ids shape: torch.Size([1, 18155]) Input IDs shape: torch.Size([1, 18155]) Labels shape: torch.Size([1, 18155]) Final batch size: 1, sequence length: 20813 Attention mask shape: torch.Size([1, 1, 20813, 20813]) Position ids shape: torch.Size([1, 20813]) Input IDs shape: torch.Size([1, 20813]) Labels shape: torch.Size([1, 20813]) Final batch size: 1, sequence length: 21163 Attention mask shape: torch.Size([1, 1, 21163, 21163]) Position ids shape: torch.Size([1, 21163]) Input IDs shape: torch.Size([1, 21163]) Labels shape: torch.Size([1, 21163]) Final batch size: 1, sequence length: 17852 Attention mask shape: torch.Size([1, 1, 17852, 17852]) Position ids shape: torch.Size([1, 17852]) Input IDs shape: torch.Size([1, 17852]) Labels shape: torch.Size([1, 17852]) Final batch size: 1, sequence length: 21316 Attention mask shape: torch.Size([1, 1, 21316, 21316]) Position ids shape: torch.Size([1, 21316]) Input IDs shape: torch.Size([1, 21316]) Labels shape: torch.Size([1, 21316]) Final batch size: 1, sequence length: 21207 Attention mask shape: torch.Size([1, 1, 21207, 21207]) Position ids shape: torch.Size([1, 21207]) Input IDs shape: torch.Size([1, 21207]) Labels shape: torch.Size([1, 21207]) Final batch size: 1, sequence length: 20176 Attention mask shape: torch.Size([1, 1, 20176, 20176]) Position ids shape: torch.Size([1, 20176]) Input IDs shape: torch.Size([1, 20176]) Labels shape: torch.Size([1, 20176]) Final batch size: 1, sequence length: 21562 Attention mask shape: torch.Size([1, 1, 21562, 21562]) Position ids shape: torch.Size([1, 21562]) Input IDs shape: torch.Size([1, 21562]) Labels shape: torch.Size([1, 21562]) Final batch size: 1, sequence length: 20766 Attention mask shape: torch.Size([1, 1, 20766, 20766]) Position ids shape: torch.Size([1, 20766]) Input IDs shape: torch.Size([1, 20766]) Labels shape: torch.Size([1, 20766]) Final batch size: 1, sequence length: 21705 Attention mask shape: torch.Size([1, 1, 21705, 21705]) Position ids shape: torch.Size([1, 21705]) Input IDs shape: torch.Size([1, 21705]) Labels shape: torch.Size([1, 21705]) Final batch size: 1, sequence length: 5911 Attention mask shape: torch.Size([1, 1, 5911, 5911]) Position ids shape: torch.Size([1, 5911]) Input IDs shape: torch.Size([1, 5911]) Labels shape: torch.Size([1, 5911]) Final batch size: 1, sequence length: 17773 Attention mask shape: torch.Size([1, 1, 17773, 17773]) Position ids shape: torch.Size([1, 17773]) Input IDs shape: torch.Size([1, 17773]) Labels shape: torch.Size([1, 17773]) Final batch size: 1, sequence length: 22152 Attention mask shape: torch.Size([1, 1, 22152, 22152]) Position ids shape: torch.Size([1, 22152]) Input IDs shape: torch.Size([1, 22152]) Labels shape: torch.Size([1, 22152]) Final batch size: 1, sequence length: 22197 Attention mask shape: torch.Size([1, 1, 22197, 22197]) Position ids shape: torch.Size([1, 22197]) Input IDs shape: torch.Size([1, 22197]) Labels shape: torch.Size([1, 22197]) Final batch size: 1, sequence length: 18545 Attention mask shape: torch.Size([1, 1, 18545, 18545]) Position ids shape: torch.Size([1, 18545]) Input IDs shape: torch.Size([1, 18545]) Labels shape: torch.Size([1, 18545]) Final batch size: 1, sequence length: 22784 Attention mask shape: torch.Size([1, 1, 22784, 22784]) Position ids shape: torch.Size([1, 22784]) Input IDs shape: torch.Size([1, 22784]) Labels shape: torch.Size([1, 22784]) Final batch size: 1, sequence length: 14496 Attention mask shape: torch.Size([1, 1, 14496, 14496]) Position ids shape: torch.Size([1, 14496]) Input IDs shape: torch.Size([1, 14496]) Labels shape: torch.Size([1, 14496]) Final batch size: 1, sequence length: 25053 Attention mask shape: torch.Size([1, 1, 25053, 25053]) Position ids shape: torch.Size([1, 25053]) Input IDs shape: torch.Size([1, 25053]) Labels shape: torch.Size([1, 25053]) Final batch size: 1, sequence length: 17910 Attention mask shape: torch.Size([1, 1, 17910, 17910]) Position ids shape: torch.Size([1, 17910]) Input IDs shape: torch.Size([1, 17910]) Labels shape: torch.Size([1, 17910]) Final batch size: 1, sequence length: 22117 Attention mask shape: torch.Size([1, 1, 22117, 22117]) Position ids shape: torch.Size([1, 22117]) Input IDs shape: torch.Size([1, 22117]) Labels shape: torch.Size([1, 22117]) Final batch size: 1, sequence length: 12421 Attention mask shape: torch.Size([1, 1, 12421, 12421]) Position ids shape: torch.Size([1, 12421]) Input IDs shape: torch.Size([1, 12421]) Labels shape: torch.Size([1, 12421]) Final batch size: 1, sequence length: 21607 Attention mask shape: torch.Size([1, 1, 21607, 21607]) Position ids shape: torch.Size([1, 21607]) Input IDs shape: torch.Size([1, 21607]) Labels shape: torch.Size([1, 21607]) Final batch size: 1, sequence length: 20101 Attention mask shape: torch.Size([1, 1, 20101, 20101]) Position ids shape: torch.Size([1, 20101]) Input IDs shape: torch.Size([1, 20101]) Labels shape: torch.Size([1, 20101]) Final batch size: 1, sequence length: 19962 Attention mask shape: torch.Size([1, 1, 19962, 19962]) Position ids shape: torch.Size([1, 19962]) Input IDs shape: torch.Size([1, 19962]) Labels shape: torch.Size([1, 19962]) Final batch size: 1, sequence length: 21962 Attention mask shape: torch.Size([1, 1, 21962, 21962]) Position ids shape: torch.Size([1, 21962]) Input IDs shape: torch.Size([1, 21962]) Labels shape: torch.Size([1, 21962]) Final batch size: 1, sequence length: 26494 Attention mask shape: torch.Size([1, 1, 26494, 26494]) Position ids shape: torch.Size([1, 26494]) Input IDs shape: torch.Size([1, 26494]) Labels shape: torch.Size([1, 26494]) Final batch size: 1, sequence length: 26921 Attention mask shape: torch.Size([1, 1, 26921, 26921]) Position ids shape: torch.Size([1, 26921]) Input IDs shape: torch.Size([1, 26921]) Labels shape: torch.Size([1, 26921]) Final batch size: 1, sequence length: 11515 Attention mask shape: torch.Size([1, 1, 11515, 11515]) Position ids shape: torch.Size([1, 11515]) Input IDs shape: torch.Size([1, 11515]) Labels shape: torch.Size([1, 11515]) Final batch size: 1, sequence length: 21672 Attention mask shape: torch.Size([1, 1, 21672, 21672]) Position ids shape: torch.Size([1, 21672]) Input IDs shape: torch.Size([1, 21672]) Labels shape: torch.Size([1, 21672]) Final batch size: 1, sequence length: 13031 Attention mask shape: torch.Size([1, 1, 13031, 13031]) Position ids shape: torch.Size([1, 13031]) Input IDs shape: torch.Size([1, 13031]) Labels shape: torch.Size([1, 13031]) Final batch size: 1, sequence length: 27633 Attention mask shape: torch.Size([1, 1, 27633, 27633]) Position ids shape: torch.Size([1, 27633]) Input IDs shape: torch.Size([1, 27633]) Labels shape: torch.Size([1, 27633]) Final batch size: 1, sequence length: 26303 Attention mask shape: torch.Size([1, 1, 26303, 26303]) Position ids shape: torch.Size([1, 26303]) Input IDs shape: torch.Size([1, 26303]) Labels shape: torch.Size([1, 26303]) Final batch size: 1, sequence length: 23338 Attention mask shape: torch.Size([1, 1, 23338, 23338]) Position ids shape: torch.Size([1, 23338]) Input IDs shape: torch.Size([1, 23338]) Labels shape: torch.Size([1, 23338]) Final batch size: 1, sequence length: 24365 Attention mask shape: torch.Size([1, 1, 24365, 24365]) Position ids shape: torch.Size([1, 24365]) Input IDs shape: torch.Size([1, 24365]) Labels shape: torch.Size([1, 24365]) Final batch size: 1, sequence length: 26138 Attention mask shape: torch.Size([1, 1, 26138, 26138]) Position ids shape: torch.Size([1, 26138]) Input IDs shape: torch.Size([1, 26138]) Labels shape: torch.Size([1, 26138]) Final batch size: 1, sequence length: 12224 Attention mask shape: torch.Size([1, 1, 12224, 12224]) Position ids shape: torch.Size([1, 12224]) Input IDs shape: torch.Size([1, 12224]) Labels shape: torch.Size([1, 12224]) Final batch size: 1, sequence length: 18470 Attention mask shape: torch.Size([1, 1, 18470, 18470]) Position ids shape: torch.Size([1, 18470]) Input IDs shape: torch.Size([1, 18470]) Labels shape: torch.Size([1, 18470]) Final batch size: 1, sequence length: 23975 Attention mask shape: torch.Size([1, 1, 23975, 23975]) Position ids shape: torch.Size([1, 23975]) Input IDs shape: torch.Size([1, 23975]) Labels shape: torch.Size([1, 23975]) Final batch size: 1, sequence length: 9704 Attention mask shape: torch.Size([1, 1, 9704, 9704]) Position ids shape: torch.Size([1, 9704]) Input IDs shape: torch.Size([1, 9704]) Labels shape: torch.Size([1, 9704]) Final batch size: 1, sequence length: 16057 Attention mask shape: torch.Size([1, 1, 16057, 16057]) Position ids shape: torch.Size([1, 16057]) Input IDs shape: torch.Size([1, 16057]) Labels shape: torch.Size([1, 16057]) Final batch size: 1, sequence length: 30592 Attention mask shape: torch.Size([1, 1, 30592, 30592]) Position ids shape: torch.Size([1, 30592]) Input IDs shape: torch.Size([1, 30592]) Labels shape: torch.Size([1, 30592]) Final batch size: 1, sequence length: 28897 Attention mask shape: torch.Size([1, 1, 28897, 28897]) Position ids shape: torch.Size([1, 28897]) Input IDs shape: torch.Size([1, 28897]) Labels shape: torch.Size([1, 28897]) Final batch size: 1, sequence length: 21766 Attention mask shape: torch.Size([1, 1, 21766, 21766]) Position ids shape: torch.Size([1, 21766]) Input IDs shape: torch.Size([1, 21766]) Labels shape: torch.Size([1, 21766]) Final batch size: 1, sequence length: 32473 Attention mask shape: torch.Size([1, 1, 32473, 32473]) Position ids shape: torch.Size([1, 32473]) Input IDs shape: torch.Size([1, 32473]) Labels shape: torch.Size([1, 32473]) Final batch size: 1, sequence length: 28523 Attention mask shape: torch.Size([1, 1, 28523, 28523]) Position ids shape: torch.Size([1, 28523]) Input IDs shape: torch.Size([1, 28523]) Labels shape: torch.Size([1, 28523]) Final batch size: 1, sequence length: 29404 Attention mask shape: torch.Size([1, 1, 29404, 29404]) Position ids shape: torch.Size([1, 29404]) Input IDs shape: torch.Size([1, 29404]) Labels shape: torch.Size([1, 29404]) Final batch size: 1, sequence length: 21581 Attention mask shape: torch.Size([1, 1, 21581, 21581]) Position ids shape: torch.Size([1, 21581]) Input IDs shape: torch.Size([1, 21581]) Labels shape: torch.Size([1, 21581]) Final batch size: 1, sequence length: 25880 Attention mask shape: torch.Size([1, 1, 25880, 25880]) Position ids shape: torch.Size([1, 25880]) Input IDs shape: torch.Size([1, 25880]) Labels shape: torch.Size([1, 25880]) Final batch size: 1, sequence length: 30965 Attention mask shape: torch.Size([1, 1, 30965, 30965]) Position ids shape: torch.Size([1, 30965]) Input IDs shape: torch.Size([1, 30965]) Labels shape: torch.Size([1, 30965]) Final batch size: 1, sequence length: 23558 Attention mask shape: torch.Size([1, 1, 23558, 23558]) Position ids shape: torch.Size([1, 23558]) Input IDs shape: torch.Size([1, 23558]) Labels shape: torch.Size([1, 23558]) Final batch size: 1, sequence length: 33482 Attention mask shape: torch.Size([1, 1, 33482, 33482]) Position ids shape: torch.Size([1, 33482]) Input IDs shape: torch.Size([1, 33482]) Labels shape: torch.Size([1, 33482]) Final batch size: 1, sequence length: 24121 Attention mask shape: torch.Size([1, 1, 24121, 24121]) Position ids shape: torch.Size([1, 24121]) Input IDs shape: torch.Size([1, 24121]) Labels shape: torch.Size([1, 24121]) Final batch size: 1, sequence length: 17456 Attention mask shape: torch.Size([1, 1, 17456, 17456]) Position ids shape: torch.Size([1, 17456]) Input IDs shape: torch.Size([1, 17456]) Labels shape: torch.Size([1, 17456]) Final batch size: 1, sequence length: 35485 Attention mask shape: torch.Size([1, 1, 35485, 35485]) Position ids shape: torch.Size([1, 35485]) Input IDs shape: torch.Size([1, 35485]) Labels shape: torch.Size([1, 35485]) Final batch size: 1, sequence length: 20198 Attention mask shape: torch.Size([1, 1, 20198, 20198]) Position ids shape: torch.Size([1, 20198]) Input IDs shape: torch.Size([1, 20198]) Labels shape: torch.Size([1, 20198]) Final batch size: 1, sequence length: 34469 Attention mask shape: torch.Size([1, 1, 34469, 34469]) Position ids shape: torch.Size([1, 34469]) Input IDs shape: torch.Size([1, 34469]) Labels shape: torch.Size([1, 34469]) Final batch size: 1, sequence length: 35100 Attention mask shape: torch.Size([1, 1, 35100, 35100]) Position ids shape: torch.Size([1, 35100]) Input IDs shape: torch.Size([1, 35100]) Labels shape: torch.Size([1, 35100]) Final batch size: 1, sequence length: 18126 Attention mask shape: torch.Size([1, 1, 18126, 18126]) Position ids shape: torch.Size([1, 18126]) Input IDs shape: torch.Size([1, 18126]) Labels shape: torch.Size([1, 18126]) Final batch size: 1, sequence length: 29481 Attention mask shape: torch.Size([1, 1, 29481, 29481]) Position ids shape: torch.Size([1, 29481]) Input IDs shape: torch.Size([1, 29481]) Labels shape: torch.Size([1, 29481]) Final batch size: 1, sequence length: 19620 Attention mask shape: torch.Size([1, 1, 19620, 19620]) Position ids shape: torch.Size([1, 19620]) Input IDs shape: torch.Size([1, 19620]) Labels shape: torch.Size([1, 19620]) Final batch size: 1, sequence length: 20184 Attention mask shape: torch.Size([1, 1, 20184, 20184]) Position ids shape: torch.Size([1, 20184]) Input IDs shape: torch.Size([1, 20184]) Labels shape: torch.Size([1, 20184]) Final batch size: 1, sequence length: 28773 Attention mask shape: torch.Size([1, 1, 28773, 28773]) Position ids shape: torch.Size([1, 28773]) Input IDs shape: torch.Size([1, 28773]) Labels shape: torch.Size([1, 28773]) Final batch size: 1, sequence length: 30428 Attention mask shape: torch.Size([1, 1, 30428, 30428]) Position ids shape: torch.Size([1, 30428]) Input IDs shape: torch.Size([1, 30428]) Labels shape: torch.Size([1, 30428]) Final batch size: 1, sequence length: 18023 Attention mask shape: torch.Size([1, 1, 18023, 18023]) Position ids shape: torch.Size([1, 18023]) Input IDs shape: torch.Size([1, 18023]) Labels shape: torch.Size([1, 18023]) Final batch size: 1, sequence length: 21491 Attention mask shape: torch.Size([1, 1, 21491, 21491]) Position ids shape: torch.Size([1, 21491]) Input IDs shape: torch.Size([1, 21491]) Labels shape: torch.Size([1, 21491]) Final batch size: 1, sequence length: 32596 Attention mask shape: torch.Size([1, 1, 32596, 32596]) Position ids shape: torch.Size([1, 32596]) Input IDs shape: torch.Size([1, 32596]) Labels shape: torch.Size([1, 32596]) Final batch size: 1, sequence length: 33685 Attention mask shape: torch.Size([1, 1, 33685, 33685]) Position ids shape: torch.Size([1, 33685]) Input IDs shape: torch.Size([1, 33685]) Labels shape: torch.Size([1, 33685]) Final batch size: 1, sequence length: 35030 Attention mask shape: torch.Size([1, 1, 35030, 35030]) Position ids shape: torch.Size([1, 35030]) Input IDs shape: torch.Size([1, 35030]) Labels shape: torch.Size([1, 35030]) Final batch size: 1, sequence length: 11608 Attention mask shape: torch.Size([1, 1, 11608, 11608]) Position ids shape: torch.Size([1, 11608]) Input IDs shape: torch.Size([1, 11608]) Labels shape: torch.Size([1, 11608]) Final batch size: 1, sequence length: 16257 Attention mask shape: torch.Size([1, 1, 16257, 16257]) Position ids shape: torch.Size([1, 16257]) Input IDs shape: torch.Size([1, 16257]) Labels shape: torch.Size([1, 16257]) Final batch size: 1, sequence length: 18454 Attention mask shape: torch.Size([1, 1, 18454, 18454]) Position ids shape: torch.Size([1, 18454]) Input IDs shape: torch.Size([1, 18454]) Labels shape: torch.Size([1, 18454]) Final batch size: 1, sequence length: 17870 Attention mask shape: torch.Size([1, 1, 17870, 17870]) Position ids shape: torch.Size([1, 17870]) Input IDs shape: torch.Size([1, 17870]) Labels shape: torch.Size([1, 17870]) Final batch size: 1, sequence length: 37040 Attention mask shape: torch.Size([1, 1, 37040, 37040]) Position ids shape: torch.Size([1, 37040]) Input IDs shape: torch.Size([1, 37040]) Labels shape: torch.Size([1, 37040]) Final batch size: 1, sequence length: 40526 Attention mask shape: torch.Size([1, 1, 40526, 40526]) Position ids shape: torch.Size([1, 40526]) Input IDs shape: torch.Size([1, 40526]) Labels shape: torch.Size([1, 40526]) Final batch size: 1, sequence length: 29537 Attention mask shape: torch.Size([1, 1, 29537, 29537]) Position ids shape: torch.Size([1, 29537]) Input IDs shape: torch.Size([1, 29537]) Labels shape: torch.Size([1, 29537]) Final batch size: 1, sequence length: 26215 Attention mask shape: torch.Size([1, 1, 26215, 26215]) Position ids shape: torch.Size([1, 26215]) Input IDs shape: torch.Size([1, 26215]) Labels shape: torch.Size([1, 26215]) Final batch size: 1, sequence length: 27702 Attention mask shape: torch.Size([1, 1, 27702, 27702]) Position ids shape: torch.Size([1, 27702]) Input IDs shape: torch.Size([1, 27702]) Labels shape: torch.Size([1, 27702]) Final batch size: 1, sequence length: 30072 Attention mask shape: torch.Size([1, 1, 30072, 30072]) Position ids shape: torch.Size([1, 30072]) Input IDs shape: torch.Size([1, 30072]) Labels shape: torch.Size([1, 30072]) Final batch size: 1, sequence length: 39589 Attention mask shape: torch.Size([1, 1, 39589, 39589]) Position ids shape: torch.Size([1, 39589]) Input IDs shape: torch.Size([1, 39589]) Labels shape: torch.Size([1, 39589]) Final batch size: 1, sequence length: 26635 Attention mask shape: torch.Size([1, 1, 26635, 26635]) Position ids shape: torch.Size([1, 26635]) Input IDs shape: torch.Size([1, 26635]) Labels shape: torch.Size([1, 26635]) Final batch size: 1, sequence length: 30789 Attention mask shape: torch.Size([1, 1, 30789, 30789]) Position ids shape: torch.Size([1, 30789]) Input IDs shape: torch.Size([1, 30789]) Labels shape: torch.Size([1, 30789]) Final batch size: 1, sequence length: 37980 Attention mask shape: torch.Size([1, 1, 37980, 37980]) Position ids shape: torch.Size([1, 37980]) Input IDs shape: torch.Size([1, 37980]) Labels shape: torch.Size([1, 37980]) Final batch size: 1, sequence length: 38927 Attention mask shape: torch.Size([1, 1, 38927, 38927]) Position ids shape: torch.Size([1, 38927]) Input IDs shape: torch.Size([1, 38927]) Labels shape: torch.Size([1, 38927]) Final batch size: 1, sequence length: 30859 Attention mask shape: torch.Size([1, 1, 30859, 30859]) Position ids shape: torch.Size([1, 30859]) Input IDs shape: torch.Size([1, 30859]) Labels shape: torch.Size([1, 30859]) Final batch size: 1, sequence length: 11403 Attention mask shape: torch.Size([1, 1, 11403, 11403]) Position ids shape: torch.Size([1, 11403]) Input IDs shape: torch.Size([1, 11403]) Labels shape: torch.Size([1, 11403]) Final batch size: 1, sequence length: 13622 Attention mask shape: torch.Size([1, 1, 13622, 13622]) Position ids shape: torch.Size([1, 13622]) Input IDs shape: torch.Size([1, 13622]) Labels shape: torch.Size([1, 13622]) Final batch size: 1, sequence length: 17811 Attention mask shape: torch.Size([1, 1, 17811, 17811]) Position ids shape: torch.Size([1, 17811]) Input IDs shape: torch.Size([1, 17811]) Labels shape: torch.Size([1, 17811]) Final batch size: 1, sequence length: 26708 Attention mask shape: torch.Size([1, 1, 26708, 26708]) Position ids shape: torch.Size([1, 26708]) Input IDs shape: torch.Size([1, 26708]) Labels shape: torch.Size([1, 26708]) Final batch size: 1, sequence length: 29875 Attention mask shape: torch.Size([1, 1, 29875, 29875]) Position ids shape: torch.Size([1, 29875]) Input IDs shape: torch.Size([1, 29875]) Labels shape: torch.Size([1, 29875]) Final batch size: 1, sequence length: 19702 Attention mask shape: torch.Size([1, 1, 19702, 19702]) Position ids shape: torch.Size([1, 19702]) Input IDs shape: torch.Size([1, 19702]) Labels shape: torch.Size([1, 19702]) Final batch size: 1, sequence length: 19538 Attention mask shape: torch.Size([1, 1, 19538, 19538]) Position ids shape: torch.Size([1, 19538]) Input IDs shape: torch.Size([1, 19538]) Labels shape: torch.Size([1, 19538]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 30772 Attention mask shape: torch.Size([1, 1, 30772, 30772]) Position ids shape: torch.Size([1, 30772]) Input IDs shape: torch.Size([1, 30772]) Labels shape: torch.Size([1, 30772]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 22309 Attention mask shape: torch.Size([1, 1, 22309, 22309]) Position ids shape: torch.Size([1, 22309]) Input IDs shape: torch.Size([1, 22309]) Labels shape: torch.Size([1, 22309]) Final batch size: 1, sequence length: 37992 Attention mask shape: torch.Size([1, 1, 37992, 37992]) Position ids shape: torch.Size([1, 37992]) Input IDs shape: torch.Size([1, 37992]) Labels shape: torch.Size([1, 37992]) Final batch size: 1, sequence length: 36580 Attention mask shape: torch.Size([1, 1, 36580, 36580]) Position ids shape: torch.Size([1, 36580]) Input IDs shape: torch.Size([1, 36580]) Labels shape: torch.Size([1, 36580]) Final batch size: 1, sequence length: 15830 Attention mask shape: torch.Size([1, 1, 15830, 15830]) Position ids shape: torch.Size([1, 15830]) Input IDs shape: torch.Size([1, 15830]) Labels shape: torch.Size([1, 15830]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 21250 Attention mask shape: torch.Size([1, 1, 21250, 21250]) Position ids shape: torch.Size([1, 21250]) Input IDs shape: torch.Size([1, 21250]) Labels shape: torch.Size([1, 21250]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 34684 Attention mask shape: torch.Size([1, 1, 34684, 34684]) Position ids shape: torch.Size([1, 34684]) Input IDs shape: torch.Size([1, 34684]) Labels shape: torch.Size([1, 34684]) Final batch size: 1, sequence length: 15993 Attention mask shape: torch.Size([1, 1, 15993, 15993]) Position ids shape: torch.Size([1, 15993]) Input IDs shape: torch.Size([1, 15993]) Labels shape: torch.Size([1, 15993]) Final batch size: 1, sequence length: 21936 Attention mask shape: torch.Size([1, 1, 21936, 21936]) Position ids shape: torch.Size([1, 21936]) Input IDs shape: torch.Size([1, 21936]) Labels shape: torch.Size([1, 21936]) Final batch size: 1, sequence length: 17623 Attention mask shape: torch.Size([1, 1, 17623, 17623]) Position ids shape: torch.Size([1, 17623]) Input IDs shape: torch.Size([1, 17623]) Labels shape: torch.Size([1, 17623]) Final batch size: 1, sequence length: 28814 Attention mask shape: torch.Size([1, 1, 28814, 28814]) Position ids shape: torch.Size([1, 28814]) Input IDs shape: torch.Size([1, 28814]) Labels shape: torch.Size([1, 28814]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 31745 Attention mask shape: torch.Size([1, 1, 31745, 31745]) Position ids shape: torch.Size([1, 31745]) Input IDs shape: torch.Size([1, 31745]) Labels shape: torch.Size([1, 31745]) Final batch size: 1, sequence length: 17971 Attention mask shape: torch.Size([1, 1, 17971, 17971]) Position ids shape: torch.Size([1, 17971]) Input IDs shape: torch.Size([1, 17971]) Labels shape: torch.Size([1, 17971]) Final batch size: 1, sequence length: 29464 Attention mask shape: torch.Size([1, 1, 29464, 29464]) Position ids shape: torch.Size([1, 29464]) Input IDs shape: torch.Size([1, 29464]) Labels shape: torch.Size([1, 29464]) Final batch size: 1, sequence length: 39661 Attention mask shape: torch.Size([1, 1, 39661, 39661]) Position ids shape: torch.Size([1, 39661]) Input IDs shape: torch.Size([1, 39661]) Labels shape: torch.Size([1, 39661]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 14730 Attention mask shape: torch.Size([1, 1, 14730, 14730]) Position ids shape: torch.Size([1, 14730]) Input IDs shape: torch.Size([1, 14730]) Labels shape: torch.Size([1, 14730]) Final batch size: 1, sequence length: 18606 Attention mask shape: torch.Size([1, 1, 18606, 18606]) Position ids shape: torch.Size([1, 18606]) Input IDs shape: torch.Size([1, 18606]) Labels shape: torch.Size([1, 18606]) Final batch size: 1, sequence length: 29639 Attention mask shape: torch.Size([1, 1, 29639, 29639]) Position ids shape: torch.Size([1, 29639]) Input IDs shape: torch.Size([1, 29639]) Labels shape: torch.Size([1, 29639]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 24232 Attention mask shape: torch.Size([1, 1, 24232, 24232]) Position ids shape: torch.Size([1, 24232]) Input IDs shape: torch.Size([1, 24232]) Labels shape: torch.Size([1, 24232]) Final batch size: 1, sequence length: 26665 Attention mask shape: torch.Size([1, 1, 26665, 26665]) Position ids shape: torch.Size([1, 26665]) Input IDs shape: torch.Size([1, 26665]) Labels shape: torch.Size([1, 26665]) Final batch size: 1, sequence length: 35077 Attention mask shape: torch.Size([1, 1, 35077, 35077]) Position ids shape: torch.Size([1, 35077]) Input IDs shape: torch.Size([1, 35077]) Labels shape: torch.Size([1, 35077]) Final batch size: 1, sequence length: 22945 Attention mask shape: torch.Size([1, 1, 22945, 22945]) Position ids shape: torch.Size([1, 22945]) Input IDs shape: torch.Size([1, 22945]) Labels shape: torch.Size([1, 22945]) Final batch size: 1, sequence length: 13903 Attention mask shape: torch.Size([1, 1, 13903, 13903]) Position ids shape: torch.Size([1, 13903]) Input IDs shape: torch.Size([1, 13903]) Labels shape: torch.Size([1, 13903]) Final batch size: 1, sequence length: 36716 Attention mask shape: torch.Size([1, 1, 36716, 36716]) Position ids shape: torch.Size([1, 36716]) Input IDs shape: torch.Size([1, 36716]) Labels shape: torch.Size([1, 36716]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 19028 Attention mask shape: torch.Size([1, 1, 19028, 19028]) Position ids shape: torch.Size([1, 19028]) Input IDs shape: torch.Size([1, 19028]) Labels shape: torch.Size([1, 19028]) Final batch size: 1, sequence length: 17778 Attention mask shape: torch.Size([1, 1, 17778, 17778]) Position ids shape: torch.Size([1, 17778]) Input IDs shape: torch.Size([1, 17778]) Labels shape: torch.Size([1, 17778]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 30135 Attention mask shape: torch.Size([1, 1, 30135, 30135]) Position ids shape: torch.Size([1, 30135]) Input IDs shape: torch.Size([1, 30135]) Labels shape: torch.Size([1, 30135]) Final batch size: 1, sequence length: 26930 Attention mask shape: torch.Size([1, 1, 26930, 26930]) Position ids shape: torch.Size([1, 26930]) Input IDs shape: torch.Size([1, 26930]) Labels shape: torch.Size([1, 26930]) Final batch size: 1, sequence length: 16270 Attention mask shape: torch.Size([1, 1, 16270, 16270]) Position ids shape: torch.Size([1, 16270]) Input IDs shape: torch.Size([1, 16270]) Labels shape: torch.Size([1, 16270]) Final batch size: 1, sequence length: 35952 Attention mask shape: torch.Size([1, 1, 35952, 35952]) Position ids shape: torch.Size([1, 35952]) Input IDs shape: torch.Size([1, 35952]) Labels shape: torch.Size([1, 35952]) Final batch size: 1, sequence length: 20765 Attention mask shape: torch.Size([1, 1, 20765, 20765]) Position ids shape: torch.Size([1, 20765]) Input IDs shape: torch.Size([1, 20765]) Labels shape: torch.Size([1, 20765]) Final batch size: 1, sequence length: 28810 Attention mask shape: torch.Size([1, 1, 28810, 28810]) Position ids shape: torch.Size([1, 28810]) Input IDs shape: torch.Size([1, 28810]) Labels shape: torch.Size([1, 28810]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 20057 Attention mask shape: torch.Size([1, 1, 20057, 20057]) Position ids shape: torch.Size([1, 20057]) Input IDs shape: torch.Size([1, 20057]) Labels shape: torch.Size([1, 20057]) Final batch size: 1, sequence length: 37866 Attention mask shape: torch.Size([1, 1, 37866, 37866]) Position ids shape: torch.Size([1, 37866]) Input IDs shape: torch.Size([1, 37866]) Labels shape: torch.Size([1, 37866]) Final batch size: 1, sequence length: 31381 Attention mask shape: torch.Size([1, 1, 31381, 31381]) Position ids shape: torch.Size([1, 31381]) Input IDs shape: torch.Size([1, 31381]) Labels shape: torch.Size([1, 31381]) Final batch size: 1, sequence length: 27281 Attention mask shape: torch.Size([1, 1, 27281, 27281]) Position ids shape: torch.Size([1, 27281]) Input IDs shape: torch.Size([1, 27281]) Labels shape: torch.Size([1, 27281]) Final batch size: 1, sequence length: 21006 Attention mask shape: torch.Size([1, 1, 21006, 21006]) Position ids shape: torch.Size([1, 21006]) Input IDs shape: torch.Size([1, 21006]) Labels shape: torch.Size([1, 21006]) Final batch size: 1, sequence length: 31420 Attention mask shape: torch.Size([1, 1, 31420, 31420]) Position ids shape: torch.Size([1, 31420]) Input IDs shape: torch.Size([1, 31420]) Labels shape: torch.Size([1, 31420]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 32024 Attention mask shape: torch.Size([1, 1, 32024, 32024]) Position ids shape: torch.Size([1, 32024]) Input IDs shape: torch.Size([1, 32024]) Labels shape: torch.Size([1, 32024]) Final batch size: 1, sequence length: 25032 Attention mask shape: torch.Size([1, 1, 25032, 25032]) Position ids shape: torch.Size([1, 25032]) Input IDs shape: torch.Size([1, 25032]) Labels shape: torch.Size([1, 25032]) Final batch size: 1, sequence length: 37407 Attention mask shape: torch.Size([1, 1, 37407, 37407]) Position ids shape: torch.Size([1, 37407]) Input IDs shape: torch.Size([1, 37407]) Labels shape: torch.Size([1, 37407]) Final batch size: 1, sequence length: 21071 Attention mask shape: torch.Size([1, 1, 21071, 21071]) Position ids shape: torch.Size([1, 21071]) Input IDs shape: torch.Size([1, 21071]) Labels shape: torch.Size([1, 21071]) Final batch size: 1, sequence length: 37397 Attention mask shape: torch.Size([1, 1, 37397, 37397]) Position ids shape: torch.Size([1, 37397]) Input IDs shape: torch.Size([1, 37397]) Labels shape: torch.Size([1, 37397]) Final batch size: 1, sequence length: 14104 Attention mask shape: torch.Size([1, 1, 14104, 14104]) Position ids shape: torch.Size([1, 14104]) Input IDs shape: torch.Size([1, 14104]) Labels shape: torch.Size([1, 14104]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 35879 Attention mask shape: torch.Size([1, 1, 35879, 35879]) Position ids shape: torch.Size([1, 35879]) Input IDs shape: torch.Size([1, 35879]) Labels shape: torch.Size([1, 35879]) Final batch size: 1, sequence length: 25388 Attention mask shape: torch.Size([1, 1, 25388, 25388]) Position ids shape: torch.Size([1, 25388]) Input IDs shape: torch.Size([1, 25388]) Labels shape: torch.Size([1, 25388]) Final batch size: 1, sequence length: 21805 Attention mask shape: torch.Size([1, 1, 21805, 21805]) Position ids shape: torch.Size([1, 21805]) Input IDs shape: torch.Size([1, 21805]) Labels shape: torch.Size([1, 21805]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 18566 Attention mask shape: torch.Size([1, 1, 18566, 18566]) Position ids shape: torch.Size([1, 18566]) Input IDs shape: torch.Size([1, 18566]) Labels shape: torch.Size([1, 18566]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 25383 Attention mask shape: torch.Size([1, 1, 25383, 25383]) Position ids shape: torch.Size([1, 25383]) Input IDs shape: torch.Size([1, 25383]) Labels shape: torch.Size([1, 25383]) Final batch size: 1, sequence length: 40579 Attention mask shape: torch.Size([1, 1, 40579, 40579]) Position ids shape: torch.Size([1, 40579]) Input IDs shape: torch.Size([1, 40579]) Labels shape: torch.Size([1, 40579]) Final batch size: 1, sequence length: 36415 Attention mask shape: torch.Size([1, 1, 36415, 36415]) Position ids shape: torch.Size([1, 36415]) Input IDs shape: torch.Size([1, 36415]) Labels shape: torch.Size([1, 36415]) Final batch size: 1, sequence length: 39587 Attention mask shape: torch.Size([1, 1, 39587, 39587]) Position ids shape: torch.Size([1, 39587]) Input IDs shape: torch.Size([1, 39587]) Labels shape: torch.Size([1, 39587]) Final batch size: 1, sequence length: 23187 Attention mask shape: torch.Size([1, 1, 23187, 23187]) Position ids shape: torch.Size([1, 23187]) Input IDs shape: torch.Size([1, 23187]) Labels shape: torch.Size([1, 23187]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 39778 Attention mask shape: torch.Size([1, 1, 39778, 39778]) Position ids shape: torch.Size([1, 39778]) Input IDs shape: torch.Size([1, 39778]) Labels shape: torch.Size([1, 39778]) Final batch size: 1, sequence length: 32541 Attention mask shape: torch.Size([1, 1, 32541, 32541]) Position ids shape: torch.Size([1, 32541]) Input IDs shape: torch.Size([1, 32541]) Labels shape: torch.Size([1, 32541]) Final batch size: 1, sequence length: 37530 Attention mask shape: torch.Size([1, 1, 37530, 37530]) Position ids shape: torch.Size([1, 37530]) Input IDs shape: torch.Size([1, 37530]) Labels shape: torch.Size([1, 37530]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 12200 Attention mask shape: torch.Size([1, 1, 12200, 12200]) Position ids shape: torch.Size([1, 12200]) Input IDs shape: torch.Size([1, 12200]) Labels shape: torch.Size([1, 12200]) Final batch size: 1, sequence length: 19668 Attention mask shape: torch.Size([1, 1, 19668, 19668]) Position ids shape: torch.Size([1, 19668]) Input IDs shape: torch.Size([1, 19668]) Labels shape: torch.Size([1, 19668]) Final batch size: 1, sequence length: 39579 Attention mask shape: torch.Size([1, 1, 39579, 39579]) Position ids shape: torch.Size([1, 39579]) Input IDs shape: torch.Size([1, 39579]) Labels shape: torch.Size([1, 39579]) Final batch size: 1, sequence length: 14308 Attention mask shape: torch.Size([1, 1, 14308, 14308]) Position ids shape: torch.Size([1, 14308]) Input IDs shape: torch.Size([1, 14308]) Labels shape: torch.Size([1, 14308]) Final batch size: 1, sequence length: 9251 Attention mask shape: torch.Size([1, 1, 9251, 9251]) Position ids shape: torch.Size([1, 9251]) Input IDs shape: torch.Size([1, 9251]) Labels shape: torch.Size([1, 9251]) Final batch size: 1, sequence length: 37890 Attention mask shape: torch.Size([1, 1, 37890, 37890]) Position ids shape: torch.Size([1, 37890]) Input IDs shape: torch.Size([1, 37890]) Labels shape: torch.Size([1, 37890]) Final batch size: 1, sequence length: 36647 Attention mask shape: torch.Size([1, 1, 36647, 36647]) Position ids shape: torch.Size([1, 36647]) Input IDs shape: torch.Size([1, 36647]) Labels shape: torch.Size([1, 36647]) {'loss': 0.2705, 'grad_norm': 0.3487038834821939, 'learning_rate': 7.500000000000001e-06, 'num_tokens': -inf, 'epoch': 3.12} Final batch size: 1, sequence length: 4858 Attention mask shape: torch.Size([1, 1, 4858, 4858]) Position ids shape: torch.Size([1, 4858]) Input IDs shape: torch.Size([1, 4858]) Labels shape: torch.Size([1, 4858]) Final batch size: 1, sequence length: 6316 Attention mask shape: torch.Size([1, 1, 6316, 6316]) Position ids shape: torch.Size([1, 6316]) Input IDs shape: torch.Size([1, 6316]) Labels shape: torch.Size([1, 6316]) Final batch size: 1, sequence length: 11448 Attention mask shape: torch.Size([1, 1, 11448, 11448]) Position ids shape: torch.Size([1, 11448]) Input IDs shape: torch.Size([1, 11448]) Labels shape: torch.Size([1, 11448]) Final batch size: 1, sequence length: 12846 Attention mask shape: torch.Size([1, 1, 12846, 12846]) Position ids shape: torch.Size([1, 12846]) Input IDs shape: torch.Size([1, 12846]) Labels shape: torch.Size([1, 12846]) Final batch size: 1, sequence length: 12075 Attention mask shape: torch.Size([1, 1, 12075, 12075]) Position ids shape: torch.Size([1, 12075]) Input IDs shape: torch.Size([1, 12075]) Labels shape: torch.Size([1, 12075]) Final batch size: 1, sequence length: 7360 Attention mask shape: torch.Size([1, 1, 7360, 7360]) Position ids shape: torch.Size([1, 7360]) Input IDs shape: torch.Size([1, 7360]) Labels shape: torch.Size([1, 7360]) Final batch size: 1, sequence length: 12945 Attention mask shape: torch.Size([1, 1, 12945, 12945]) Position ids shape: torch.Size([1, 12945]) Input IDs shape: torch.Size([1, 12945]) Labels shape: torch.Size([1, 12945]) Final batch size: 1, sequence length: 15189 Attention mask shape: torch.Size([1, 1, 15189, 15189]) Position ids shape: torch.Size([1, 15189]) Input IDs shape: torch.Size([1, 15189]) Labels shape: torch.Size([1, 15189]) Final batch size: 1, sequence length: 14891 Attention mask shape: torch.Size([1, 1, 14891, 14891]) Position ids shape: torch.Size([1, 14891]) Input IDs shape: torch.Size([1, 14891]) Labels shape: torch.Size([1, 14891]) Final batch size: 1, sequence length: 14330 Attention mask shape: torch.Size([1, 1, 14330, 14330]) Position ids shape: torch.Size([1, 14330]) Input IDs shape: torch.Size([1, 14330]) Labels shape: torch.Size([1, 14330]) Final batch size: 1, sequence length: 16961 Attention mask shape: torch.Size([1, 1, 16961, 16961]) Position ids shape: torch.Size([1, 16961]) Input IDs shape: torch.Size([1, 16961]) Labels shape: torch.Size([1, 16961]) Final batch size: 1, sequence length: 17246 Attention mask shape: torch.Size([1, 1, 17246, 17246]) Position ids shape: torch.Size([1, 17246]) Input IDs shape: torch.Size([1, 17246]) Labels shape: torch.Size([1, 17246]) Final batch size: 1, sequence length: 16658 Attention mask shape: torch.Size([1, 1, 16658, 16658]) Position ids shape: torch.Size([1, 16658]) Input IDs shape: torch.Size([1, 16658]) Labels shape: torch.Size([1, 16658]) Final batch size: 1, sequence length: 16391 Attention mask shape: torch.Size([1, 1, 16391, 16391]) Position ids shape: torch.Size([1, 16391]) Input IDs shape: torch.Size([1, 16391]) Labels shape: torch.Size([1, 16391]) Final batch size: 1, sequence length: 16536 Attention mask shape: torch.Size([1, 1, 16536, 16536]) Position ids shape: torch.Size([1, 16536]) Input IDs shape: torch.Size([1, 16536]) Labels shape: torch.Size([1, 16536]) Final batch size: 1, sequence length: 18341 Attention mask shape: torch.Size([1, 1, 18341, 18341]) Position ids shape: torch.Size([1, 18341]) Input IDs shape: torch.Size([1, 18341]) Labels shape: torch.Size([1, 18341]) Final batch size: 1, sequence length: 16326 Attention mask shape: torch.Size([1, 1, 16326, 16326]) Position ids shape: torch.Size([1, 16326]) Input IDs shape: torch.Size([1, 16326]) Labels shape: torch.Size([1, 16326]) Final batch size: 1, sequence length: 18400 Attention mask shape: torch.Size([1, 1, 18400, 18400]) Position ids shape: torch.Size([1, 18400]) Input IDs shape: torch.Size([1, 18400]) Labels shape: torch.Size([1, 18400]) Final batch size: 1, sequence length: 18014 Attention mask shape: torch.Size([1, 1, 18014, 18014]) Position ids shape: torch.Size([1, 18014]) Input IDs shape: torch.Size([1, 18014]) Labels shape: torch.Size([1, 18014]) Final batch size: 1, sequence length: 19332 Attention mask shape: torch.Size([1, 1, 19332, 19332]) Position ids shape: torch.Size([1, 19332]) Input IDs shape: torch.Size([1, 19332]) Labels shape: torch.Size([1, 19332]) Final batch size: 1, sequence length: 18424 Attention mask shape: torch.Size([1, 1, 18424, 18424]) Position ids shape: torch.Size([1, 18424]) Input IDs shape: torch.Size([1, 18424]) Labels shape: torch.Size([1, 18424]) Final batch size: 1, sequence length: 19999 Attention mask shape: torch.Size([1, 1, 19999, 19999]) Position ids shape: torch.Size([1, 19999]) Input IDs shape: torch.Size([1, 19999]) Labels shape: torch.Size([1, 19999]) Final batch size: 1, sequence length: 21617 Attention mask shape: torch.Size([1, 1, 21617, 21617]) Position ids shape: torch.Size([1, 21617]) Input IDs shape: torch.Size([1, 21617]) Labels shape: torch.Size([1, 21617]) Final batch size: 1, sequence length: 21122 Attention mask shape: torch.Size([1, 1, 21122, 21122]) Position ids shape: torch.Size([1, 21122]) Input IDs shape: torch.Size([1, 21122]) Labels shape: torch.Size([1, 21122]) Final batch size: 1, sequence length: 20912 Attention mask shape: torch.Size([1, 1, 20912, 20912]) Position ids shape: torch.Size([1, 20912]) Input IDs shape: torch.Size([1, 20912]) Labels shape: torch.Size([1, 20912]) Final batch size: 1, sequence length: 20888 Attention mask shape: torch.Size([1, 1, 20888, 20888]) Position ids shape: torch.Size([1, 20888]) Input IDs shape: torch.Size([1, 20888]) Labels shape: torch.Size([1, 20888]) Final batch size: 1, sequence length: 19597 Attention mask shape: torch.Size([1, 1, 19597, 19597]) Position ids shape: torch.Size([1, 19597]) Input IDs shape: torch.Size([1, 19597]) Labels shape: torch.Size([1, 19597]) Final batch size: 1, sequence length: 21314 Attention mask shape: torch.Size([1, 1, 21314, 21314]) Position ids shape: torch.Size([1, 21314]) Input IDs shape: torch.Size([1, 21314]) Labels shape: torch.Size([1, 21314]) Final batch size: 1, sequence length: 20118 Attention mask shape: torch.Size([1, 1, 20118, 20118]) Position ids shape: torch.Size([1, 20118]) Input IDs shape: torch.Size([1, 20118]) Labels shape: torch.Size([1, 20118]) Final batch size: 1, sequence length: 18393 Attention mask shape: torch.Size([1, 1, 18393, 18393]) Position ids shape: torch.Size([1, 18393]) Input IDs shape: torch.Size([1, 18393]) Labels shape: torch.Size([1, 18393]) Final batch size: 1, sequence length: 16831 Attention mask shape: torch.Size([1, 1, 16831, 16831]) Position ids shape: torch.Size([1, 16831]) Input IDs shape: torch.Size([1, 16831]) Labels shape: torch.Size([1, 16831]) Final batch size: 1, sequence length: 12328 Attention mask shape: torch.Size([1, 1, 12328, 12328]) Position ids shape: torch.Size([1, 12328]) Input IDs shape: torch.Size([1, 12328]) Labels shape: torch.Size([1, 12328]) Final batch size: 1, sequence length: 13638 Attention mask shape: torch.Size([1, 1, 13638, 13638]) Position ids shape: torch.Size([1, 13638]) Input IDs shape: torch.Size([1, 13638]) Labels shape: torch.Size([1, 13638]) Final batch size: 1, sequence length: 21739 Attention mask shape: torch.Size([1, 1, 21739, 21739]) Position ids shape: torch.Size([1, 21739]) Input IDs shape: torch.Size([1, 21739]) Labels shape: torch.Size([1, 21739]) Final batch size: 1, sequence length: 22854 Attention mask shape: torch.Size([1, 1, 22854, 22854]) Position ids shape: torch.Size([1, 22854]) Input IDs shape: torch.Size([1, 22854]) Labels shape: torch.Size([1, 22854]) Final batch size: 1, sequence length: 21611 Attention mask shape: torch.Size([1, 1, 21611, 21611]) Position ids shape: torch.Size([1, 21611]) Input IDs shape: torch.Size([1, 21611]) Labels shape: torch.Size([1, 21611]) Final batch size: 1, sequence length: 25909 Attention mask shape: torch.Size([1, 1, 25909, 25909]) Position ids shape: torch.Size([1, 25909]) Input IDs shape: torch.Size([1, 25909]) Labels shape: torch.Size([1, 25909]) Final batch size: 1, sequence length: 15232 Attention mask shape: torch.Size([1, 1, 15232, 15232]) Position ids shape: torch.Size([1, 15232]) Input IDs shape: torch.Size([1, 15232]) Labels shape: torch.Size([1, 15232]) Final batch size: 1, sequence length: 6839 Attention mask shape: torch.Size([1, 1, 6839, 6839]) Position ids shape: torch.Size([1, 6839]) Input IDs shape: torch.Size([1, 6839]) Labels shape: torch.Size([1, 6839]) Final batch size: 1, sequence length: 9341 Attention mask shape: torch.Size([1, 1, 9341, 9341]) Position ids shape: torch.Size([1, 9341]) Input IDs shape: torch.Size([1, 9341]) Labels shape: torch.Size([1, 9341]) Final batch size: 1, sequence length: 24597 Attention mask shape: torch.Size([1, 1, 24597, 24597]) Position ids shape: torch.Size([1, 24597]) Input IDs shape: torch.Size([1, 24597]) Labels shape: torch.Size([1, 24597]) Final batch size: 1, sequence length: 25042 Attention mask shape: torch.Size([1, 1, 25042, 25042]) Position ids shape: torch.Size([1, 25042]) Input IDs shape: torch.Size([1, 25042]) Labels shape: torch.Size([1, 25042]) Final batch size: 1, sequence length: 16611 Attention mask shape: torch.Size([1, 1, 16611, 16611]) Position ids shape: torch.Size([1, 16611]) Input IDs shape: torch.Size([1, 16611]) Labels shape: torch.Size([1, 16611]) Final batch size: 1, sequence length: 19885 Attention mask shape: torch.Size([1, 1, 19885, 19885]) Position ids shape: torch.Size([1, 19885]) Input IDs shape: torch.Size([1, 19885]) Labels shape: torch.Size([1, 19885]) Final batch size: 1, sequence length: 23614 Attention mask shape: torch.Size([1, 1, 23614, 23614]) Position ids shape: torch.Size([1, 23614]) Input IDs shape: torch.Size([1, 23614]) Labels shape: torch.Size([1, 23614]) Final batch size: 1, sequence length: 15026 Attention mask shape: torch.Size([1, 1, 15026, 15026]) Position ids shape: torch.Size([1, 15026]) Input IDs shape: torch.Size([1, 15026]) Labels shape: torch.Size([1, 15026]) Final batch size: 1, sequence length: 26218 Attention mask shape: torch.Size([1, 1, 26218, 26218]) Position ids shape: torch.Size([1, 26218]) Input IDs shape: torch.Size([1, 26218]) Labels shape: torch.Size([1, 26218]) Final batch size: 1, sequence length: 13646 Attention mask shape: torch.Size([1, 1, 13646, 13646]) Position ids shape: torch.Size([1, 13646]) Input IDs shape: torch.Size([1, 13646]) Labels shape: torch.Size([1, 13646]) Final batch size: 1, sequence length: 21725 Attention mask shape: torch.Size([1, 1, 21725, 21725]) Position ids shape: torch.Size([1, 21725]) Input IDs shape: torch.Size([1, 21725]) Labels shape: torch.Size([1, 21725]) Final batch size: 1, sequence length: 29730 Attention mask shape: torch.Size([1, 1, 29730, 29730]) Position ids shape: torch.Size([1, 29730]) Input IDs shape: torch.Size([1, 29730]) Labels shape: torch.Size([1, 29730]) Final batch size: 1, sequence length: 20619 Attention mask shape: torch.Size([1, 1, 20619, 20619]) Position ids shape: torch.Size([1, 20619]) Input IDs shape: torch.Size([1, 20619]) Labels shape: torch.Size([1, 20619]) Final batch size: 1, sequence length: 27795 Attention mask shape: torch.Size([1, 1, 27795, 27795]) Position ids shape: torch.Size([1, 27795]) Input IDs shape: torch.Size([1, 27795]) Labels shape: torch.Size([1, 27795]) Final batch size: 1, sequence length: 21677 Attention mask shape: torch.Size([1, 1, 21677, 21677]) Position ids shape: torch.Size([1, 21677]) Input IDs shape: torch.Size([1, 21677]) Labels shape: torch.Size([1, 21677]) Final batch size: 1, sequence length: 24342 Attention mask shape: torch.Size([1, 1, 24342, 24342]) Position ids shape: torch.Size([1, 24342]) Input IDs shape: torch.Size([1, 24342]) Labels shape: torch.Size([1, 24342]) Final batch size: 1, sequence length: 24090 Attention mask shape: torch.Size([1, 1, 24090, 24090]) Position ids shape: torch.Size([1, 24090]) Input IDs shape: torch.Size([1, 24090]) Labels shape: torch.Size([1, 24090]) Final batch size: 1, sequence length: 28348 Attention mask shape: torch.Size([1, 1, 28348, 28348]) Position ids shape: torch.Size([1, 28348]) Input IDs shape: torch.Size([1, 28348]) Labels shape: torch.Size([1, 28348]) Final batch size: 1, sequence length: 30366 Attention mask shape: torch.Size([1, 1, 30366, 30366]) Position ids shape: torch.Size([1, 30366]) Input IDs shape: torch.Size([1, 30366]) Labels shape: torch.Size([1, 30366]) Final batch size: 1, sequence length: 25575 Attention mask shape: torch.Size([1, 1, 25575, 25575]) Position ids shape: torch.Size([1, 25575]) Input IDs shape: torch.Size([1, 25575]) Labels shape: torch.Size([1, 25575]) Final batch size: 1, sequence length: 20702 Attention mask shape: torch.Size([1, 1, 20702, 20702]) Position ids shape: torch.Size([1, 20702]) Input IDs shape: torch.Size([1, 20702]) Labels shape: torch.Size([1, 20702]) Final batch size: 1, sequence length: 17780 Attention mask shape: torch.Size([1, 1, 17780, 17780]) Position ids shape: torch.Size([1, 17780]) Input IDs shape: torch.Size([1, 17780]) Labels shape: torch.Size([1, 17780]) Final batch size: 1, sequence length: 21623 Attention mask shape: torch.Size([1, 1, 21623, 21623]) Position ids shape: torch.Size([1, 21623]) Input IDs shape: torch.Size([1, 21623]) Labels shape: torch.Size([1, 21623]) Final batch size: 1, sequence length: 22217 Attention mask shape: torch.Size([1, 1, 22217, 22217]) Position ids shape: torch.Size([1, 22217]) Input IDs shape: torch.Size([1, 22217]) Labels shape: torch.Size([1, 22217]) Final batch size: 1, sequence length: 30917 Attention mask shape: torch.Size([1, 1, 30917, 30917]) Position ids shape: torch.Size([1, 30917]) Input IDs shape: torch.Size([1, 30917]) Labels shape: torch.Size([1, 30917]) Final batch size: 1, sequence length: 32034 Attention mask shape: torch.Size([1, 1, 32034, 32034]) Position ids shape: torch.Size([1, 32034]) Input IDs shape: torch.Size([1, 32034]) Labels shape: torch.Size([1, 32034]) Final batch size: 1, sequence length: 30410 Attention mask shape: torch.Size([1, 1, 30410, 30410]) Position ids shape: torch.Size([1, 30410]) Input IDs shape: torch.Size([1, 30410]) Labels shape: torch.Size([1, 30410]) Final batch size: 1, sequence length: 20915 Attention mask shape: torch.Size([1, 1, 20915, 20915]) Position ids shape: torch.Size([1, 20915]) Input IDs shape: torch.Size([1, 20915]) Labels shape: torch.Size([1, 20915]) Final batch size: 1, sequence length: 28678 Attention mask shape: torch.Size([1, 1, 28678, 28678]) Position ids shape: torch.Size([1, 28678]) Input IDs shape: torch.Size([1, 28678]) Labels shape: torch.Size([1, 28678]) Final batch size: 1, sequence length: 16403 Attention mask shape: torch.Size([1, 1, 16403, 16403]) Position ids shape: torch.Size([1, 16403]) Input IDs shape: torch.Size([1, 16403]) Labels shape: torch.Size([1, 16403]) Final batch size: 1, sequence length: 30796 Attention mask shape: torch.Size([1, 1, 30796, 30796]) Position ids shape: torch.Size([1, 30796]) Input IDs shape: torch.Size([1, 30796]) Labels shape: torch.Size([1, 30796]) Final batch size: 1, sequence length: 30766 Attention mask shape: torch.Size([1, 1, 30766, 30766]) Position ids shape: torch.Size([1, 30766]) Input IDs shape: torch.Size([1, 30766]) Labels shape: torch.Size([1, 30766]) Final batch size: 1, sequence length: 35628 Attention mask shape: torch.Size([1, 1, 35628, 35628]) Position ids shape: torch.Size([1, 35628]) Input IDs shape: torch.Size([1, 35628]) Labels shape: torch.Size([1, 35628]) Final batch size: 1, sequence length: 32308 Attention mask shape: torch.Size([1, 1, 32308, 32308]) Position ids shape: torch.Size([1, 32308]) Input IDs shape: torch.Size([1, 32308]) Labels shape: torch.Size([1, 32308]) Final batch size: 1, sequence length: 25698 Attention mask shape: torch.Size([1, 1, 25698, 25698]) Position ids shape: torch.Size([1, 25698]) Input IDs shape: torch.Size([1, 25698]) Labels shape: torch.Size([1, 25698]) Final batch size: 1, sequence length: 35781 Attention mask shape: torch.Size([1, 1, 35781, 35781]) Position ids shape: torch.Size([1, 35781]) Input IDs shape: torch.Size([1, 35781]) Labels shape: torch.Size([1, 35781]) Final batch size: 1, sequence length: 32630 Attention mask shape: torch.Size([1, 1, 32630, 32630]) Position ids shape: torch.Size([1, 32630]) Input IDs shape: torch.Size([1, 32630]) Labels shape: torch.Size([1, 32630]) Final batch size: 1, sequence length: 35697 Attention mask shape: torch.Size([1, 1, 35697, 35697]) Position ids shape: torch.Size([1, 35697]) Input IDs shape: torch.Size([1, 35697]) Labels shape: torch.Size([1, 35697]) Final batch size: 1, sequence length: 37195 Attention mask shape: torch.Size([1, 1, 37195, 37195]) Position ids shape: torch.Size([1, 37195]) Input IDs shape: torch.Size([1, 37195]) Labels shape: torch.Size([1, 37195]) Final batch size: 1, sequence length: 23626 Attention mask shape: torch.Size([1, 1, 23626, 23626]) Position ids shape: torch.Size([1, 23626]) Input IDs shape: torch.Size([1, 23626]) Labels shape: torch.Size([1, 23626]) Final batch size: 1, sequence length: 15726 Attention mask shape: torch.Size([1, 1, 15726, 15726]) Position ids shape: torch.Size([1, 15726]) Input IDs shape: torch.Size([1, 15726]) Labels shape: torch.Size([1, 15726]) Final batch size: 1, sequence length: 38362 Attention mask shape: torch.Size([1, 1, 38362, 38362]) Position ids shape: torch.Size([1, 38362]) Input IDs shape: torch.Size([1, 38362]) Labels shape: torch.Size([1, 38362]) Final batch size: 1, sequence length: 21562 Attention mask shape: torch.Size([1, 1, 21562, 21562]) Position ids shape: torch.Size([1, 21562]) Input IDs shape: torch.Size([1, 21562]) Labels shape: torch.Size([1, 21562]) Final batch size: 1, sequence length: 32239 Attention mask shape: torch.Size([1, 1, 32239, 32239]) Position ids shape: torch.Size([1, 32239]) Input IDs shape: torch.Size([1, 32239]) Labels shape: torch.Size([1, 32239]) Final batch size: 1, sequence length: 14070 Attention mask shape: torch.Size([1, 1, 14070, 14070]) Position ids shape: torch.Size([1, 14070]) Input IDs shape: torch.Size([1, 14070]) Labels shape: torch.Size([1, 14070]) Final batch size: 1, sequence length: 20028 Attention mask shape: torch.Size([1, 1, 20028, 20028]) Position ids shape: torch.Size([1, 20028]) Input IDs shape: torch.Size([1, 20028]) Labels shape: torch.Size([1, 20028]) Final batch size: 1, sequence length: 34802 Attention mask shape: torch.Size([1, 1, 34802, 34802]) Position ids shape: torch.Size([1, 34802]) Input IDs shape: torch.Size([1, 34802]) Labels shape: torch.Size([1, 34802]) Final batch size: 1, sequence length: 38420 Attention mask shape: torch.Size([1, 1, 38420, 38420]) Position ids shape: torch.Size([1, 38420]) Input IDs shape: torch.Size([1, 38420]) Labels shape: torch.Size([1, 38420]) Final batch size: 1, sequence length: 39608 Attention mask shape: torch.Size([1, 1, 39608, 39608]) Position ids shape: torch.Size([1, 39608]) Input IDs shape: torch.Size([1, 39608]) Labels shape: torch.Size([1, 39608]) Final batch size: 1, sequence length: 11896 Attention mask shape: torch.Size([1, 1, 11896, 11896]) Position ids shape: torch.Size([1, 11896]) Input IDs shape: torch.Size([1, 11896]) Labels shape: torch.Size([1, 11896]) Final batch size: 1, sequence length: 38190 Attention mask shape: torch.Size([1, 1, 38190, 38190]) Position ids shape: torch.Size([1, 38190]) Input IDs shape: torch.Size([1, 38190]) Labels shape: torch.Size([1, 38190]) Final batch size: 1, sequence length: 22199 Attention mask shape: torch.Size([1, 1, 22199, 22199]) Position ids shape: torch.Size([1, 22199]) Input IDs shape: torch.Size([1, 22199]) Labels shape: torch.Size([1, 22199]) Final batch size: 1, sequence length: 22768 Attention mask shape: torch.Size([1, 1, 22768, 22768]) Position ids shape: torch.Size([1, 22768]) Input IDs shape: torch.Size([1, 22768]) Labels shape: torch.Size([1, 22768]) Final batch size: 1, sequence length: 14828 Attention mask shape: torch.Size([1, 1, 14828, 14828]) Position ids shape: torch.Size([1, 14828]) Input IDs shape: torch.Size([1, 14828]) Labels shape: torch.Size([1, 14828]) Final batch size: 1, sequence length: 29202 Attention mask shape: torch.Size([1, 1, 29202, 29202]) Position ids shape: torch.Size([1, 29202]) Input IDs shape: torch.Size([1, 29202]) Labels shape: torch.Size([1, 29202]) Final batch size: 1, sequence length: 18568 Attention mask shape: torch.Size([1, 1, 18568, 18568]) Position ids shape: torch.Size([1, 18568]) Input IDs shape: torch.Size([1, 18568]) Labels shape: torch.Size([1, 18568]) Final batch size: 1, sequence length: 10426 Attention mask shape: torch.Size([1, 1, 10426, 10426]) Position ids shape: torch.Size([1, 10426]) Input IDs shape: torch.Size([1, 10426]) Labels shape: torch.Size([1, 10426]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 34785 Attention mask shape: torch.Size([1, 1, 34785, 34785]) Position ids shape: torch.Size([1, 34785]) Input IDs shape: torch.Size([1, 34785]) Labels shape: torch.Size([1, 34785]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 26287 Attention mask shape: torch.Size([1, 1, 26287, 26287]) Position ids shape: torch.Size([1, 26287]) Input IDs shape: torch.Size([1, 26287]) Labels shape: torch.Size([1, 26287]) Final batch size: 1, sequence length: 31143 Attention mask shape: torch.Size([1, 1, 31143, 31143]) Position ids shape: torch.Size([1, 31143]) Input IDs shape: torch.Size([1, 31143]) Labels shape: torch.Size([1, 31143]) Final batch size: 1, sequence length: 37529 Attention mask shape: torch.Size([1, 1, 37529, 37529]) Position ids shape: torch.Size([1, 37529]) Input IDs shape: torch.Size([1, 37529]) Labels shape: torch.Size([1, 37529]) Final batch size: 1, sequence length: 30651 Attention mask shape: torch.Size([1, 1, 30651, 30651]) Position ids shape: torch.Size([1, 30651]) Input IDs shape: torch.Size([1, 30651]) Labels shape: torch.Size([1, 30651]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 26840 Attention mask shape: torch.Size([1, 1, 26840, 26840]) Position ids shape: torch.Size([1, 26840]) Input IDs shape: torch.Size([1, 26840]) Labels shape: torch.Size([1, 26840]) Final batch size: 1, sequence length: 16304 Attention mask shape: torch.Size([1, 1, 16304, 16304]) Position ids shape: torch.Size([1, 16304]) Input IDs shape: torch.Size([1, 16304]) Labels shape: torch.Size([1, 16304]) Final batch size: 1, sequence length: 40088 Attention mask shape: torch.Size([1, 1, 40088, 40088]) Position ids shape: torch.Size([1, 40088]) Input IDs shape: torch.Size([1, 40088]) Labels shape: torch.Size([1, 40088]) Final batch size: 1, sequence length: 35914 Attention mask shape: torch.Size([1, 1, 35914, 35914]) Position ids shape: torch.Size([1, 35914]) Input IDs shape: torch.Size([1, 35914]) Labels shape: torch.Size([1, 35914]) Final batch size: 1, sequence length: 27819 Attention mask shape: torch.Size([1, 1, 27819, 27819]) Position ids shape: torch.Size([1, 27819]) Input IDs shape: torch.Size([1, 27819]) Labels shape: torch.Size([1, 27819]) Final batch size: 1, sequence length: 18859 Attention mask shape: torch.Size([1, 1, 18859, 18859]) Position ids shape: torch.Size([1, 18859]) Input IDs shape: torch.Size([1, 18859]) Labels shape: torch.Size([1, 18859]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 10198 Attention mask shape: torch.Size([1, 1, 10198, 10198]) Position ids shape: torch.Size([1, 10198]) Input IDs shape: torch.Size([1, 10198]) Labels shape: torch.Size([1, 10198]) Final batch size: 1, sequence length: 32559 Attention mask shape: torch.Size([1, 1, 32559, 32559]) Position ids shape: torch.Size([1, 32559]) Input IDs shape: torch.Size([1, 32559]) Labels shape: torch.Size([1, 32559]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 36638 Attention mask shape: torch.Size([1, 1, 36638, 36638]) Position ids shape: torch.Size([1, 36638]) Input IDs shape: torch.Size([1, 36638]) Labels shape: torch.Size([1, 36638]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 19494 Attention mask shape: torch.Size([1, 1, 19494, 19494]) Position ids shape: torch.Size([1, 19494]) Input IDs shape: torch.Size([1, 19494]) Labels shape: torch.Size([1, 19494]) Final batch size: 1, sequence length: 40763 Attention mask shape: torch.Size([1, 1, 40763, 40763]) Position ids shape: torch.Size([1, 40763]) Input IDs shape: torch.Size([1, 40763]) Labels shape: torch.Size([1, 40763]) Final batch size: 1, sequence length: 31316 Attention mask shape: torch.Size([1, 1, 31316, 31316]) Position ids shape: torch.Size([1, 31316]) Input IDs shape: torch.Size([1, 31316]) Labels shape: torch.Size([1, 31316]) Final batch size: 1, sequence length: 31754 Attention mask shape: torch.Size([1, 1, 31754, 31754]) Position ids shape: torch.Size([1, 31754]) Input IDs shape: torch.Size([1, 31754]) Labels shape: torch.Size([1, 31754]) Final batch size: 1, sequence length: 30835 Attention mask shape: torch.Size([1, 1, 30835, 30835]) Position ids shape: torch.Size([1, 30835]) Input IDs shape: torch.Size([1, 30835]) Labels shape: torch.Size([1, 30835]) Final batch size: 1, sequence length: 23936 Attention mask shape: torch.Size([1, 1, 23936, 23936]) Position ids shape: torch.Size([1, 23936]) Input IDs shape: torch.Size([1, 23936]) Labels shape: torch.Size([1, 23936]) Final batch size: 1, sequence length: 12605 Attention mask shape: torch.Size([1, 1, 12605, 12605]) Position ids shape: torch.Size([1, 12605]) Input IDs shape: torch.Size([1, 12605]) Labels shape: torch.Size([1, 12605]) Final batch size: 1, sequence length: 26269 Attention mask shape: torch.Size([1, 1, 26269, 26269]) Position ids shape: torch.Size([1, 26269]) Input IDs shape: torch.Size([1, 26269]) Labels shape: torch.Size([1, 26269]) Final batch size: 1, sequence length: 25573 Attention mask shape: torch.Size([1, 1, 25573, 25573]) Position ids shape: torch.Size([1, 25573]) Input IDs shape: torch.Size([1, 25573]) Labels shape: torch.Size([1, 25573]) Final batch size: 1, sequence length: 37170 Attention mask shape: torch.Size([1, 1, 37170, 37170]) Position ids shape: torch.Size([1, 37170]) Input IDs shape: torch.Size([1, 37170]) Labels shape: torch.Size([1, 37170]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) {'loss': 0.2946, 'grad_norm': 0.40689145021584255, 'learning_rate': 7.269952498697734e-06, 'num_tokens': -inf, 'epoch': 3.25} Final batch size: 1, sequence length: 7998 Attention mask shape: torch.Size([1, 1, 7998, 7998]) Position ids shape: torch.Size([1, 7998]) Input IDs shape: torch.Size([1, 7998]) Labels shape: torch.Size([1, 7998]) Final batch size: 1, sequence length: 6925 Attention mask shape: torch.Size([1, 1, 6925, 6925]) Position ids shape: torch.Size([1, 6925]) Input IDs shape: torch.Size([1, 6925]) Labels shape: torch.Size([1, 6925]) Final batch size: 1, sequence length: 7402 Attention mask shape: torch.Size([1, 1, 7402, 7402]) Position ids shape: torch.Size([1, 7402]) Input IDs shape: torch.Size([1, 7402]) Labels shape: torch.Size([1, 7402]) Final batch size: 1, sequence length: 10102 Attention mask shape: torch.Size([1, 1, 10102, 10102]) Position ids shape: torch.Size([1, 10102]) Input IDs shape: torch.Size([1, 10102]) Labels shape: torch.Size([1, 10102]) Final batch size: 1, sequence length: 9452 Attention mask shape: torch.Size([1, 1, 9452, 9452]) Position ids shape: torch.Size([1, 9452]) Input IDs shape: torch.Size([1, 9452]) Labels shape: torch.Size([1, 9452]) Final batch size: 1, sequence length: 10862 Attention mask shape: torch.Size([1, 1, 10862, 10862]) Position ids shape: torch.Size([1, 10862]) Input IDs shape: torch.Size([1, 10862]) Labels shape: torch.Size([1, 10862]) Final batch size: 1, sequence length: 10804 Attention mask shape: torch.Size([1, 1, 10804, 10804]) Position ids shape: torch.Size([1, 10804]) Input IDs shape: torch.Size([1, 10804]) Labels shape: torch.Size([1, 10804]) Final batch size: 1, sequence length: 12719 Attention mask shape: torch.Size([1, 1, 12719, 12719]) Position ids shape: torch.Size([1, 12719]) Input IDs shape: torch.Size([1, 12719]) Labels shape: torch.Size([1, 12719]) Final batch size: 1, sequence length: 8432 Attention mask shape: torch.Size([1, 1, 8432, 8432]) Position ids shape: torch.Size([1, 8432]) Input IDs shape: torch.Size([1, 8432]) Labels shape: torch.Size([1, 8432]) Final batch size: 1, sequence length: 11947 Attention mask shape: torch.Size([1, 1, 11947, 11947]) Position ids shape: torch.Size([1, 11947]) Input IDs shape: torch.Size([1, 11947]) Labels shape: torch.Size([1, 11947]) Final batch size: 1, sequence length: 11623 Attention mask shape: torch.Size([1, 1, 11623, 11623]) Position ids shape: torch.Size([1, 11623]) Input IDs shape: torch.Size([1, 11623]) Labels shape: torch.Size([1, 11623]) Final batch size: 1, sequence length: 9243 Attention mask shape: torch.Size([1, 1, 9243, 9243]) Position ids shape: torch.Size([1, 9243]) Input IDs shape: torch.Size([1, 9243]) Labels shape: torch.Size([1, 9243]) Final batch size: 1, sequence length: 15794 Attention mask shape: torch.Size([1, 1, 15794, 15794]) Position ids shape: torch.Size([1, 15794]) Input IDs shape: torch.Size([1, 15794]) Labels shape: torch.Size([1, 15794]) Final batch size: 1, sequence length: 15053 Attention mask shape: torch.Size([1, 1, 15053, 15053]) Position ids shape: torch.Size([1, 15053]) Input IDs shape: torch.Size([1, 15053]) Labels shape: torch.Size([1, 15053]) Final batch size: 1, sequence length: 15886 Attention mask shape: torch.Size([1, 1, 15886, 15886]) Position ids shape: torch.Size([1, 15886]) Input IDs shape: torch.Size([1, 15886]) Labels shape: torch.Size([1, 15886]) Final batch size: 1, sequence length: 17376 Attention mask shape: torch.Size([1, 1, 17376, 17376]) Position ids shape: torch.Size([1, 17376]) Input IDs shape: torch.Size([1, 17376]) Labels shape: torch.Size([1, 17376]) Final batch size: 1, sequence length: 13209 Attention mask shape: torch.Size([1, 1, 13209, 13209]) Position ids shape: torch.Size([1, 13209]) Input IDs shape: torch.Size([1, 13209]) Labels shape: torch.Size([1, 13209]) Final batch size: 1, sequence length: 17918 Attention mask shape: torch.Size([1, 1, 17918, 17918]) Position ids shape: torch.Size([1, 17918]) Input IDs shape: torch.Size([1, 17918]) Labels shape: torch.Size([1, 17918]) Final batch size: 1, sequence length: 17370 Attention mask shape: torch.Size([1, 1, 17370, 17370]) Position ids shape: torch.Size([1, 17370]) Input IDs shape: torch.Size([1, 17370]) Labels shape: torch.Size([1, 17370]) Final batch size: 1, sequence length: 16060 Attention mask shape: torch.Size([1, 1, 16060, 16060]) Position ids shape: torch.Size([1, 16060]) Input IDs shape: torch.Size([1, 16060]) Labels shape: torch.Size([1, 16060]) Final batch size: 1, sequence length: 19609 Attention mask shape: torch.Size([1, 1, 19609, 19609]) Position ids shape: torch.Size([1, 19609]) Input IDs shape: torch.Size([1, 19609]) Labels shape: torch.Size([1, 19609]) Final batch size: 1, sequence length: 20185 Attention mask shape: torch.Size([1, 1, 20185, 20185]) Position ids shape: torch.Size([1, 20185]) Input IDs shape: torch.Size([1, 20185]) Labels shape: torch.Size([1, 20185]) Final batch size: 1, sequence length: 19935 Attention mask shape: torch.Size([1, 1, 19935, 19935]) Position ids shape: torch.Size([1, 19935]) Input IDs shape: torch.Size([1, 19935]) Labels shape: torch.Size([1, 19935]) Final batch size: 1, sequence length: 17767 Attention mask shape: torch.Size([1, 1, 17767, 17767]) Position ids shape: torch.Size([1, 17767]) Input IDs shape: torch.Size([1, 17767]) Labels shape: torch.Size([1, 17767]) Final batch size: 1, sequence length: 19629 Attention mask shape: torch.Size([1, 1, 19629, 19629]) Position ids shape: torch.Size([1, 19629]) Input IDs shape: torch.Size([1, 19629]) Labels shape: torch.Size([1, 19629]) Final batch size: 1, sequence length: 20056 Attention mask shape: torch.Size([1, 1, 20056, 20056]) Position ids shape: torch.Size([1, 20056]) Input IDs shape: torch.Size([1, 20056]) Labels shape: torch.Size([1, 20056]) Final batch size: 1, sequence length: 21117 Attention mask shape: torch.Size([1, 1, 21117, 21117]) Position ids shape: torch.Size([1, 21117]) Input IDs shape: torch.Size([1, 21117]) Labels shape: torch.Size([1, 21117]) Final batch size: 1, sequence length: 21771 Attention mask shape: torch.Size([1, 1, 21771, 21771]) Position ids shape: torch.Size([1, 21771]) Input IDs shape: torch.Size([1, 21771]) Labels shape: torch.Size([1, 21771]) Final batch size: 1, sequence length: 16536 Attention mask shape: torch.Size([1, 1, 16536, 16536]) Position ids shape: torch.Size([1, 16536]) Input IDs shape: torch.Size([1, 16536]) Labels shape: torch.Size([1, 16536]) Final batch size: 1, sequence length: 21980 Attention mask shape: torch.Size([1, 1, 21980, 21980]) Position ids shape: torch.Size([1, 21980]) Input IDs shape: torch.Size([1, 21980]) Labels shape: torch.Size([1, 21980]) Final batch size: 1, sequence length: 17665 Attention mask shape: torch.Size([1, 1, 17665, 17665]) Position ids shape: torch.Size([1, 17665]) Input IDs shape: torch.Size([1, 17665]) Labels shape: torch.Size([1, 17665]) Final batch size: 1, sequence length: 12006 Attention mask shape: torch.Size([1, 1, 12006, 12006]) Position ids shape: torch.Size([1, 12006]) Input IDs shape: torch.Size([1, 12006]) Labels shape: torch.Size([1, 12006]) Final batch size: 1, sequence length: 22797 Attention mask shape: torch.Size([1, 1, 22797, 22797]) Position ids shape: torch.Size([1, 22797]) Input IDs shape: torch.Size([1, 22797]) Labels shape: torch.Size([1, 22797]) Final batch size: 1, sequence length: 20554 Attention mask shape: torch.Size([1, 1, 20554, 20554]) Position ids shape: torch.Size([1, 20554]) Input IDs shape: torch.Size([1, 20554]) Labels shape: torch.Size([1, 20554]) Final batch size: 1, sequence length: 21421 Attention mask shape: torch.Size([1, 1, 21421, 21421]) Position ids shape: torch.Size([1, 21421]) Input IDs shape: torch.Size([1, 21421]) Labels shape: torch.Size([1, 21421]) Final batch size: 1, sequence length: 8498 Attention mask shape: torch.Size([1, 1, 8498, 8498]) Position ids shape: torch.Size([1, 8498]) Input IDs shape: torch.Size([1, 8498]) Labels shape: torch.Size([1, 8498]) Final batch size: 1, sequence length: 17625 Attention mask shape: torch.Size([1, 1, 17625, 17625]) Position ids shape: torch.Size([1, 17625]) Input IDs shape: torch.Size([1, 17625]) Labels shape: torch.Size([1, 17625]) Final batch size: 1, sequence length: 22208 Attention mask shape: torch.Size([1, 1, 22208, 22208]) Position ids shape: torch.Size([1, 22208]) Input IDs shape: torch.Size([1, 22208]) Labels shape: torch.Size([1, 22208]) Final batch size: 1, sequence length: 20816 Attention mask shape: torch.Size([1, 1, 20816, 20816]) Position ids shape: torch.Size([1, 20816]) Input IDs shape: torch.Size([1, 20816]) Labels shape: torch.Size([1, 20816]) Final batch size: 1, sequence length: 24515 Attention mask shape: torch.Size([1, 1, 24515, 24515]) Position ids shape: torch.Size([1, 24515]) Input IDs shape: torch.Size([1, 24515]) Labels shape: torch.Size([1, 24515]) Final batch size: 1, sequence length: 20524 Attention mask shape: torch.Size([1, 1, 20524, 20524]) Position ids shape: torch.Size([1, 20524]) Input IDs shape: torch.Size([1, 20524]) Labels shape: torch.Size([1, 20524]) Final batch size: 1, sequence length: 16141 Attention mask shape: torch.Size([1, 1, 16141, 16141]) Position ids shape: torch.Size([1, 16141]) Input IDs shape: torch.Size([1, 16141]) Labels shape: torch.Size([1, 16141]) Final batch size: 1, sequence length: 24308 Attention mask shape: torch.Size([1, 1, 24308, 24308]) Position ids shape: torch.Size([1, 24308]) Input IDs shape: torch.Size([1, 24308]) Labels shape: torch.Size([1, 24308]) Final batch size: 1, sequence length: 16439 Attention mask shape: torch.Size([1, 1, 16439, 16439]) Position ids shape: torch.Size([1, 16439]) Input IDs shape: torch.Size([1, 16439]) Labels shape: torch.Size([1, 16439]) Final batch size: 1, sequence length: 27179 Attention mask shape: torch.Size([1, 1, 27179, 27179]) Position ids shape: torch.Size([1, 27179]) Input IDs shape: torch.Size([1, 27179]) Labels shape: torch.Size([1, 27179]) Final batch size: 1, sequence length: 18991 Attention mask shape: torch.Size([1, 1, 18991, 18991]) Position ids shape: torch.Size([1, 18991]) Input IDs shape: torch.Size([1, 18991]) Labels shape: torch.Size([1, 18991]) Final batch size: 1, sequence length: 26142 Attention mask shape: torch.Size([1, 1, 26142, 26142]) Position ids shape: torch.Size([1, 26142]) Input IDs shape: torch.Size([1, 26142]) Labels shape: torch.Size([1, 26142]) Final batch size: 1, sequence length: 24433 Attention mask shape: torch.Size([1, 1, 24433, 24433]) Position ids shape: torch.Size([1, 24433]) Input IDs shape: torch.Size([1, 24433]) Labels shape: torch.Size([1, 24433]) Final batch size: 1, sequence length: 22683 Attention mask shape: torch.Size([1, 1, 22683, 22683]) Position ids shape: torch.Size([1, 22683]) Input IDs shape: torch.Size([1, 22683]) Labels shape: torch.Size([1, 22683]) Final batch size: 1, sequence length: 21152 Attention mask shape: torch.Size([1, 1, 21152, 21152]) Position ids shape: torch.Size([1, 21152]) Input IDs shape: torch.Size([1, 21152]) Labels shape: torch.Size([1, 21152]) Final batch size: 1, sequence length: 18122 Attention mask shape: torch.Size([1, 1, 18122, 18122]) Position ids shape: torch.Size([1, 18122]) Input IDs shape: torch.Size([1, 18122]) Labels shape: torch.Size([1, 18122]) Final batch size: 1, sequence length: 26449 Attention mask shape: torch.Size([1, 1, 26449, 26449]) Position ids shape: torch.Size([1, 26449]) Input IDs shape: torch.Size([1, 26449]) Labels shape: torch.Size([1, 26449]) Final batch size: 1, sequence length: 17363 Attention mask shape: torch.Size([1, 1, 17363, 17363]) Position ids shape: torch.Size([1, 17363]) Input IDs shape: torch.Size([1, 17363]) Labels shape: torch.Size([1, 17363]) Final batch size: 1, sequence length: 29628 Attention mask shape: torch.Size([1, 1, 29628, 29628]) Position ids shape: torch.Size([1, 29628]) Input IDs shape: torch.Size([1, 29628]) Labels shape: torch.Size([1, 29628]) Final batch size: 1, sequence length: 16564 Attention mask shape: torch.Size([1, 1, 16564, 16564]) Position ids shape: torch.Size([1, 16564]) Input IDs shape: torch.Size([1, 16564]) Labels shape: torch.Size([1, 16564]) Final batch size: 1, sequence length: 24802 Attention mask shape: torch.Size([1, 1, 24802, 24802]) Position ids shape: torch.Size([1, 24802]) Input IDs shape: torch.Size([1, 24802]) Labels shape: torch.Size([1, 24802]) Final batch size: 1, sequence length: 31507 Attention mask shape: torch.Size([1, 1, 31507, 31507]) Position ids shape: torch.Size([1, 31507]) Input IDs shape: torch.Size([1, 31507]) Labels shape: torch.Size([1, 31507]) Final batch size: 1, sequence length: 31339 Attention mask shape: torch.Size([1, 1, 31339, 31339]) Position ids shape: torch.Size([1, 31339]) Input IDs shape: torch.Size([1, 31339]) Labels shape: torch.Size([1, 31339]) Final batch size: 1, sequence length: 16837 Attention mask shape: torch.Size([1, 1, 16837, 16837]) Position ids shape: torch.Size([1, 16837]) Input IDs shape: torch.Size([1, 16837]) Labels shape: torch.Size([1, 16837]) Final batch size: 1, sequence length: 32752 Attention mask shape: torch.Size([1, 1, 32752, 32752]) Position ids shape: torch.Size([1, 32752]) Input IDs shape: torch.Size([1, 32752]) Labels shape: torch.Size([1, 32752]) Final batch size: 1, sequence length: 28164 Attention mask shape: torch.Size([1, 1, 28164, 28164]) Position ids shape: torch.Size([1, 28164]) Input IDs shape: torch.Size([1, 28164]) Labels shape: torch.Size([1, 28164]) Final batch size: 1, sequence length: 21143 Attention mask shape: torch.Size([1, 1, 21143, 21143]) Position ids shape: torch.Size([1, 21143]) Input IDs shape: torch.Size([1, 21143]) Labels shape: torch.Size([1, 21143]) Final batch size: 1, sequence length: 29392 Attention mask shape: torch.Size([1, 1, 29392, 29392]) Position ids shape: torch.Size([1, 29392]) Input IDs shape: torch.Size([1, 29392]) Labels shape: torch.Size([1, 29392]) Final batch size: 1, sequence length: 24527 Attention mask shape: torch.Size([1, 1, 24527, 24527]) Position ids shape: torch.Size([1, 24527]) Input IDs shape: torch.Size([1, 24527]) Labels shape: torch.Size([1, 24527]) Final batch size: 1, sequence length: 30181 Attention mask shape: torch.Size([1, 1, 30181, 30181]) Position ids shape: torch.Size([1, 30181]) Input IDs shape: torch.Size([1, 30181]) Labels shape: torch.Size([1, 30181]) Final batch size: 1, sequence length: 22762 Attention mask shape: torch.Size([1, 1, 22762, 22762]) Position ids shape: torch.Size([1, 22762]) Input IDs shape: torch.Size([1, 22762]) Labels shape: torch.Size([1, 22762]) Final batch size: 1, sequence length: 33794 Attention mask shape: torch.Size([1, 1, 33794, 33794]) Position ids shape: torch.Size([1, 33794]) Input IDs shape: torch.Size([1, 33794]) Labels shape: torch.Size([1, 33794]) Final batch size: 1, sequence length: 30197 Attention mask shape: torch.Size([1, 1, 30197, 30197]) Position ids shape: torch.Size([1, 30197]) Input IDs shape: torch.Size([1, 30197]) Labels shape: torch.Size([1, 30197]) Final batch size: 1, sequence length: 34711 Attention mask shape: torch.Size([1, 1, 34711, 34711]) Position ids shape: torch.Size([1, 34711]) Input IDs shape: torch.Size([1, 34711]) Labels shape: torch.Size([1, 34711]) Final batch size: 1, sequence length: 26121 Attention mask shape: torch.Size([1, 1, 26121, 26121]) Position ids shape: torch.Size([1, 26121]) Input IDs shape: torch.Size([1, 26121]) Labels shape: torch.Size([1, 26121]) Final batch size: 1, sequence length: 32891 Attention mask shape: torch.Size([1, 1, 32891, 32891]) Position ids shape: torch.Size([1, 32891]) Input IDs shape: torch.Size([1, 32891]) Labels shape: torch.Size([1, 32891]) Final batch size: 1, sequence length: 10857 Attention mask shape: torch.Size([1, 1, 10857, 10857]) Position ids shape: torch.Size([1, 10857]) Input IDs shape: torch.Size([1, 10857]) Labels shape: torch.Size([1, 10857]) Final batch size: 1, sequence length: 33894 Attention mask shape: torch.Size([1, 1, 33894, 33894]) Position ids shape: torch.Size([1, 33894]) Input IDs shape: torch.Size([1, 33894]) Labels shape: torch.Size([1, 33894]) Final batch size: 1, sequence length: 37343 Attention mask shape: torch.Size([1, 1, 37343, 37343]) Position ids shape: torch.Size([1, 37343]) Input IDs shape: torch.Size([1, 37343]) Labels shape: torch.Size([1, 37343]) Final batch size: 1, sequence length: 33087 Attention mask shape: torch.Size([1, 1, 33087, 33087]) Position ids shape: torch.Size([1, 33087]) Input IDs shape: torch.Size([1, 33087]) Labels shape: torch.Size([1, 33087]) Final batch size: 1, sequence length: 20274 Attention mask shape: torch.Size([1, 1, 20274, 20274]) Position ids shape: torch.Size([1, 20274]) Input IDs shape: torch.Size([1, 20274]) Labels shape: torch.Size([1, 20274]) Final batch size: 1, sequence length: 34405 Attention mask shape: torch.Size([1, 1, 34405, 34405]) Position ids shape: torch.Size([1, 34405]) Input IDs shape: torch.Size([1, 34405]) Labels shape: torch.Size([1, 34405]) Final batch size: 1, sequence length: 31875 Attention mask shape: torch.Size([1, 1, 31875, 31875]) Position ids shape: torch.Size([1, 31875]) Input IDs shape: torch.Size([1, 31875]) Labels shape: torch.Size([1, 31875]) Final batch size: 1, sequence length: 13942 Attention mask shape: torch.Size([1, 1, 13942, 13942]) Position ids shape: torch.Size([1, 13942]) Input IDs shape: torch.Size([1, 13942]) Labels shape: torch.Size([1, 13942]) Final batch size: 1, sequence length: 11107 Attention mask shape: torch.Size([1, 1, 11107, 11107]) Position ids shape: torch.Size([1, 11107]) Input IDs shape: torch.Size([1, 11107]) Labels shape: torch.Size([1, 11107]) Final batch size: 1, sequence length: 34512 Attention mask shape: torch.Size([1, 1, 34512, 34512]) Position ids shape: torch.Size([1, 34512]) Input IDs shape: torch.Size([1, 34512]) Labels shape: torch.Size([1, 34512]) Final batch size: 1, sequence length: 38010 Attention mask shape: torch.Size([1, 1, 38010, 38010]) Position ids shape: torch.Size([1, 38010]) Input IDs shape: torch.Size([1, 38010]) Labels shape: torch.Size([1, 38010]) Final batch size: 1, sequence length: 37555 Attention mask shape: torch.Size([1, 1, 37555, 37555]) Position ids shape: torch.Size([1, 37555]) Input IDs shape: torch.Size([1, 37555]) Labels shape: torch.Size([1, 37555]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 31561 Attention mask shape: torch.Size([1, 1, 31561, 31561]) Position ids shape: torch.Size([1, 31561]) Input IDs shape: torch.Size([1, 31561]) Labels shape: torch.Size([1, 31561]) Final batch size: 1, sequence length: 17181 Attention mask shape: torch.Size([1, 1, 17181, 17181]) Position ids shape: torch.Size([1, 17181]) Input IDs shape: torch.Size([1, 17181]) Labels shape: torch.Size([1, 17181]) Final batch size: 1, sequence length: 35523 Attention mask shape: torch.Size([1, 1, 35523, 35523]) Position ids shape: torch.Size([1, 35523]) Input IDs shape: torch.Size([1, 35523]) Labels shape: torch.Size([1, 35523]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 32466 Attention mask shape: torch.Size([1, 1, 32466, 32466]) Position ids shape: torch.Size([1, 32466]) Input IDs shape: torch.Size([1, 32466]) Labels shape: torch.Size([1, 32466]) Final batch size: 1, sequence length: 31525 Attention mask shape: torch.Size([1, 1, 31525, 31525]) Position ids shape: torch.Size([1, 31525]) Input IDs shape: torch.Size([1, 31525]) Labels shape: torch.Size([1, 31525]) Final batch size: 1, sequence length: 40910 Attention mask shape: torch.Size([1, 1, 40910, 40910]) Position ids shape: torch.Size([1, 40910]) Input IDs shape: torch.Size([1, 40910]) Labels shape: torch.Size([1, 40910]) Final batch size: 1, sequence length: 33367 Attention mask shape: torch.Size([1, 1, 33367, 33367]) Position ids shape: torch.Size([1, 33367]) Input IDs shape: torch.Size([1, 33367]) Labels shape: torch.Size([1, 33367]) Final batch size: 1, sequence length: 16433 Attention mask shape: torch.Size([1, 1, 16433, 16433]) Position ids shape: torch.Size([1, 16433]) Input IDs shape: torch.Size([1, 16433]) Labels shape: torch.Size([1, 16433]) Final batch size: 1, sequence length: 16716 Attention mask shape: torch.Size([1, 1, 16716, 16716]) Position ids shape: torch.Size([1, 16716]) Input IDs shape: torch.Size([1, 16716]) Labels shape: torch.Size([1, 16716]) Final batch size: 1, sequence length: 31518 Attention mask shape: torch.Size([1, 1, 31518, 31518]) Position ids shape: torch.Size([1, 31518]) Input IDs shape: torch.Size([1, 31518]) Labels shape: torch.Size([1, 31518]) Final batch size: 1, sequence length: 27061 Attention mask shape: torch.Size([1, 1, 27061, 27061]) Position ids shape: torch.Size([1, 27061]) Input IDs shape: torch.Size([1, 27061]) Labels shape: torch.Size([1, 27061]) Final batch size: 1, sequence length: 36464 Attention mask shape: torch.Size([1, 1, 36464, 36464]) Position ids shape: torch.Size([1, 36464]) Input IDs shape: torch.Size([1, 36464]) Labels shape: torch.Size([1, 36464]) Final batch size: 1, sequence length: 19287 Attention mask shape: torch.Size([1, 1, 19287, 19287]) Position ids shape: torch.Size([1, 19287]) Input IDs shape: torch.Size([1, 19287]) Labels shape: torch.Size([1, 19287]) Final batch size: 1, sequence length: 13978 Attention mask shape: torch.Size([1, 1, 13978, 13978]) Position ids shape: torch.Size([1, 13978]) Input IDs shape: torch.Size([1, 13978]) Labels shape: torch.Size([1, 13978]) Final batch size: 1, sequence length: 21859 Attention mask shape: torch.Size([1, 1, 21859, 21859]) Position ids shape: torch.Size([1, 21859]) Input IDs shape: torch.Size([1, 21859]) Labels shape: torch.Size([1, 21859]) Final batch size: 1, sequence length: 29042 Attention mask shape: torch.Size([1, 1, 29042, 29042]) Position ids shape: torch.Size([1, 29042]) Input IDs shape: torch.Size([1, 29042]) Labels shape: torch.Size([1, 29042]) Final batch size: 1, sequence length: 30134 Attention mask shape: torch.Size([1, 1, 30134, 30134]) Position ids shape: torch.Size([1, 30134]) Input IDs shape: torch.Size([1, 30134]) Labels shape: torch.Size([1, 30134]) Final batch size: 1, sequence length: 29216 Attention mask shape: torch.Size([1, 1, 29216, 29216]) Position ids shape: torch.Size([1, 29216]) Input IDs shape: torch.Size([1, 29216]) Labels shape: torch.Size([1, 29216]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40752 Attention mask shape: torch.Size([1, 1, 40752, 40752]) Position ids shape: torch.Size([1, 40752]) Input IDs shape: torch.Size([1, 40752]) Labels shape: torch.Size([1, 40752]) Final batch size: 1, sequence length: 31449 Attention mask shape: torch.Size([1, 1, 31449, 31449]) Position ids shape: torch.Size([1, 31449]) Input IDs shape: torch.Size([1, 31449]) Labels shape: torch.Size([1, 31449]) Final batch size: 1, sequence length: 36450 Attention mask shape: torch.Size([1, 1, 36450, 36450]) Position ids shape: torch.Size([1, 36450]) Input IDs shape: torch.Size([1, 36450]) Labels shape: torch.Size([1, 36450]) Final batch size: 1, sequence length: 17758 Attention mask shape: torch.Size([1, 1, 17758, 17758]) Position ids shape: torch.Size([1, 17758]) Input IDs shape: torch.Size([1, 17758]) Labels shape: torch.Size([1, 17758]) Final batch size: 1, sequence length: 39169 Attention mask shape: torch.Size([1, 1, 39169, 39169]) Position ids shape: torch.Size([1, 39169]) Input IDs shape: torch.Size([1, 39169]) Labels shape: torch.Size([1, 39169]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 17535 Attention mask shape: torch.Size([1, 1, 17535, 17535]) Position ids shape: torch.Size([1, 17535]) Input IDs shape: torch.Size([1, 17535]) Labels shape: torch.Size([1, 17535]) Final batch size: 1, sequence length: 37349 Final batch size: 1, sequence length: 18229 Attention mask shape: torch.Size([1, 1, 37349, 37349]) Position ids shape: torch.Size([1, 37349]) Input IDs shape: torch.Size([1, 37349]) Attention mask shape: torch.Size([1, 1, 18229, 18229]) Labels shape: torch.Size([1, 37349]) Position ids shape: torch.Size([1, 18229]) Input IDs shape: torch.Size([1, 18229]) Labels shape: torch.Size([1, 18229]) Final batch size: 1, sequence length: 38529 Attention mask shape: torch.Size([1, 1, 38529, 38529]) Position ids shape: torch.Size([1, 38529]) Input IDs shape: torch.Size([1, 38529]) Labels shape: torch.Size([1, 38529]) Final batch size: 1, sequence length: 26893 Attention mask shape: torch.Size([1, 1, 26893, 26893]) Position ids shape: torch.Size([1, 26893]) Input IDs shape: torch.Size([1, 26893]) Labels shape: torch.Size([1, 26893]) Final batch size: 1, sequence length: 38891 Attention mask shape: torch.Size([1, 1, 38891, 38891]) Position ids shape: torch.Size([1, 38891]) Input IDs shape: torch.Size([1, 38891]) Labels shape: torch.Size([1, 38891]) Final batch size: 1, sequence length: 39258 Attention mask shape: torch.Size([1, 1, 39258, 39258]) Position ids shape: torch.Size([1, 39258]) Input IDs shape: torch.Size([1, 39258]) Labels shape: torch.Size([1, 39258]) Final batch size: 1, sequence length: 17890 Attention mask shape: torch.Size([1, 1, 17890, 17890]) Position ids shape: torch.Size([1, 17890]) Input IDs shape: torch.Size([1, 17890]) Labels shape: torch.Size([1, 17890]) Final batch size: 1, sequence length: 26922 Attention mask shape: torch.Size([1, 1, 26922, 26922]) Position ids shape: torch.Size([1, 26922]) Input IDs shape: torch.Size([1, 26922]) Labels shape: torch.Size([1, 26922]) Final batch size: 1, sequence length: 32638 Attention mask shape: torch.Size([1, 1, 32638, 32638]) Position ids shape: torch.Size([1, 32638]) Input IDs shape: torch.Size([1, 32638]) Labels shape: torch.Size([1, 32638]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 36599 Attention mask shape: torch.Size([1, 1, 36599, 36599]) Position ids shape: torch.Size([1, 36599]) Input IDs shape: torch.Size([1, 36599]) Labels shape: torch.Size([1, 36599]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 32465 Attention mask shape: torch.Size([1, 1, 32465, 32465]) Position ids shape: torch.Size([1, 32465]) Input IDs shape: torch.Size([1, 32465]) Labels shape: torch.Size([1, 32465]) Final batch size: 1, sequence length: 18471 Attention mask shape: torch.Size([1, 1, 18471, 18471]) Position ids shape: torch.Size([1, 18471]) Input IDs shape: torch.Size([1, 18471]) Labels shape: torch.Size([1, 18471]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 39953 Attention mask shape: torch.Size([1, 1, 39953, 39953]) Position ids shape: torch.Size([1, 39953]) Input IDs shape: torch.Size([1, 39953]) Labels shape: torch.Size([1, 39953]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) {'loss': 0.2814, 'grad_norm': 0.3697162338624912, 'learning_rate': 7.033683215379002e-06, 'num_tokens': -inf, 'epoch': 3.38} Final batch size: 1, sequence length: 5818 Attention mask shape: torch.Size([1, 1, 5818, 5818]) Position ids shape: torch.Size([1, 5818]) Input IDs shape: torch.Size([1, 5818]) Labels shape: torch.Size([1, 5818]) Final batch size: 1, sequence length: 7977 Attention mask shape: torch.Size([1, 1, 7977, 7977]) Position ids shape: torch.Size([1, 7977]) Input IDs shape: torch.Size([1, 7977]) Labels shape: torch.Size([1, 7977]) Final batch size: 1, sequence length: 6215 Attention mask shape: torch.Size([1, 1, 6215, 6215]) Position ids shape: torch.Size([1, 6215]) Input IDs shape: torch.Size([1, 6215]) Labels shape: torch.Size([1, 6215]) Final batch size: 1, sequence length: 10107 Attention mask shape: torch.Size([1, 1, 10107, 10107]) Position ids shape: torch.Size([1, 10107]) Input IDs shape: torch.Size([1, 10107]) Labels shape: torch.Size([1, 10107]) Final batch size: 1, sequence length: 6095 Attention mask shape: torch.Size([1, 1, 6095, 6095]) Position ids shape: torch.Size([1, 6095]) Input IDs shape: torch.Size([1, 6095]) Labels shape: torch.Size([1, 6095]) Final batch size: 1, sequence length: 8500 Attention mask shape: torch.Size([1, 1, 8500, 8500]) Position ids shape: torch.Size([1, 8500]) Input IDs shape: torch.Size([1, 8500]) Labels shape: torch.Size([1, 8500]) Final batch size: 1, sequence length: 12928 Attention mask shape: torch.Size([1, 1, 12928, 12928]) Position ids shape: torch.Size([1, 12928]) Input IDs shape: torch.Size([1, 12928]) Labels shape: torch.Size([1, 12928]) Final batch size: 1, sequence length: 12826 Attention mask shape: torch.Size([1, 1, 12826, 12826]) Position ids shape: torch.Size([1, 12826]) Input IDs shape: torch.Size([1, 12826]) Labels shape: torch.Size([1, 12826]) Final batch size: 1, sequence length: 13624 Attention mask shape: torch.Size([1, 1, 13624, 13624]) Position ids shape: torch.Size([1, 13624]) Input IDs shape: torch.Size([1, 13624]) Labels shape: torch.Size([1, 13624]) Final batch size: 1, sequence length: 13092 Attention mask shape: torch.Size([1, 1, 13092, 13092]) Position ids shape: torch.Size([1, 13092]) Input IDs shape: torch.Size([1, 13092]) Labels shape: torch.Size([1, 13092]) Final batch size: 1, sequence length: 9217 Attention mask shape: torch.Size([1, 1, 9217, 9217]) Position ids shape: torch.Size([1, 9217]) Input IDs shape: torch.Size([1, 9217]) Labels shape: torch.Size([1, 9217]) Final batch size: 1, sequence length: 13459 Attention mask shape: torch.Size([1, 1, 13459, 13459]) Position ids shape: torch.Size([1, 13459]) Input IDs shape: torch.Size([1, 13459]) Labels shape: torch.Size([1, 13459]) Final batch size: 1, sequence length: 10687 Attention mask shape: torch.Size([1, 1, 10687, 10687]) Position ids shape: torch.Size([1, 10687]) Input IDs shape: torch.Size([1, 10687]) Labels shape: torch.Size([1, 10687]) Final batch size: 1, sequence length: 12556 Attention mask shape: torch.Size([1, 1, 12556, 12556]) Position ids shape: torch.Size([1, 12556]) Input IDs shape: torch.Size([1, 12556]) Labels shape: torch.Size([1, 12556]) Final batch size: 1, sequence length: 10080 Attention mask shape: torch.Size([1, 1, 10080, 10080]) Position ids shape: torch.Size([1, 10080]) Input IDs shape: torch.Size([1, 10080]) Labels shape: torch.Size([1, 10080]) Final batch size: 1, sequence length: 10505 Attention mask shape: torch.Size([1, 1, 10505, 10505]) Position ids shape: torch.Size([1, 10505]) Input IDs shape: torch.Size([1, 10505]) Labels shape: torch.Size([1, 10505]) Final batch size: 1, sequence length: 9379 Attention mask shape: torch.Size([1, 1, 9379, 9379]) Position ids shape: torch.Size([1, 9379]) Input IDs shape: torch.Size([1, 9379]) Labels shape: torch.Size([1, 9379]) Final batch size: 1, sequence length: 16782 Attention mask shape: torch.Size([1, 1, 16782, 16782]) Position ids shape: torch.Size([1, 16782]) Input IDs shape: torch.Size([1, 16782]) Labels shape: torch.Size([1, 16782]) Final batch size: 1, sequence length: 12454 Attention mask shape: torch.Size([1, 1, 12454, 12454]) Position ids shape: torch.Size([1, 12454]) Input IDs shape: torch.Size([1, 12454]) Labels shape: torch.Size([1, 12454]) Final batch size: 1, sequence length: 17980 Attention mask shape: torch.Size([1, 1, 17980, 17980]) Position ids shape: torch.Size([1, 17980]) Input IDs shape: torch.Size([1, 17980]) Labels shape: torch.Size([1, 17980]) Final batch size: 1, sequence length: 10469 Attention mask shape: torch.Size([1, 1, 10469, 10469]) Position ids shape: torch.Size([1, 10469]) Input IDs shape: torch.Size([1, 10469]) Labels shape: torch.Size([1, 10469]) Final batch size: 1, sequence length: 18454 Attention mask shape: torch.Size([1, 1, 18454, 18454]) Position ids shape: torch.Size([1, 18454]) Input IDs shape: torch.Size([1, 18454]) Labels shape: torch.Size([1, 18454]) Final batch size: 1, sequence length: 10277 Attention mask shape: torch.Size([1, 1, 10277, 10277]) Position ids shape: torch.Size([1, 10277]) Input IDs shape: torch.Size([1, 10277]) Labels shape: torch.Size([1, 10277]) Final batch size: 1, sequence length: 19671 Attention mask shape: torch.Size([1, 1, 19671, 19671]) Position ids shape: torch.Size([1, 19671]) Input IDs shape: torch.Size([1, 19671]) Labels shape: torch.Size([1, 19671]) Final batch size: 1, sequence length: 15185 Attention mask shape: torch.Size([1, 1, 15185, 15185]) Position ids shape: torch.Size([1, 15185]) Input IDs shape: torch.Size([1, 15185]) Labels shape: torch.Size([1, 15185]) Final batch size: 1, sequence length: 19683 Attention mask shape: torch.Size([1, 1, 19683, 19683]) Position ids shape: torch.Size([1, 19683]) Input IDs shape: torch.Size([1, 19683]) Labels shape: torch.Size([1, 19683]) Final batch size: 1, sequence length: 10390 Attention mask shape: torch.Size([1, 1, 10390, 10390]) Position ids shape: torch.Size([1, 10390]) Input IDs shape: torch.Size([1, 10390]) Labels shape: torch.Size([1, 10390]) Final batch size: 1, sequence length: 18469 Attention mask shape: torch.Size([1, 1, 18469, 18469]) Position ids shape: torch.Size([1, 18469]) Input IDs shape: torch.Size([1, 18469]) Labels shape: torch.Size([1, 18469]) Final batch size: 1, sequence length: 20243 Attention mask shape: torch.Size([1, 1, 20243, 20243]) Position ids shape: torch.Size([1, 20243]) Input IDs shape: torch.Size([1, 20243]) Labels shape: torch.Size([1, 20243]) Final batch size: 1, sequence length: 15263 Attention mask shape: torch.Size([1, 1, 15263, 15263]) Position ids shape: torch.Size([1, 15263]) Input IDs shape: torch.Size([1, 15263]) Labels shape: torch.Size([1, 15263]) Final batch size: 1, sequence length: 20191 Attention mask shape: torch.Size([1, 1, 20191, 20191]) Position ids shape: torch.Size([1, 20191]) Input IDs shape: torch.Size([1, 20191]) Labels shape: torch.Size([1, 20191]) Final batch size: 1, sequence length: 21051 Attention mask shape: torch.Size([1, 1, 21051, 21051]) Position ids shape: torch.Size([1, 21051]) Input IDs shape: torch.Size([1, 21051]) Labels shape: torch.Size([1, 21051]) Final batch size: 1, sequence length: 17417 Attention mask shape: torch.Size([1, 1, 17417, 17417]) Position ids shape: torch.Size([1, 17417]) Input IDs shape: torch.Size([1, 17417]) Labels shape: torch.Size([1, 17417]) Final batch size: 1, sequence length: 16585 Attention mask shape: torch.Size([1, 1, 16585, 16585]) Position ids shape: torch.Size([1, 16585]) Input IDs shape: torch.Size([1, 16585]) Labels shape: torch.Size([1, 16585]) Final batch size: 1, sequence length: 10182 Attention mask shape: torch.Size([1, 1, 10182, 10182]) Position ids shape: torch.Size([1, 10182]) Input IDs shape: torch.Size([1, 10182]) Labels shape: torch.Size([1, 10182]) Final batch size: 1, sequence length: 23238 Attention mask shape: torch.Size([1, 1, 23238, 23238]) Position ids shape: torch.Size([1, 23238]) Input IDs shape: torch.Size([1, 23238]) Labels shape: torch.Size([1, 23238]) Final batch size: 1, sequence length: 15339 Attention mask shape: torch.Size([1, 1, 15339, 15339]) Position ids shape: torch.Size([1, 15339]) Input IDs shape: torch.Size([1, 15339]) Labels shape: torch.Size([1, 15339]) Final batch size: 1, sequence length: 24255 Attention mask shape: torch.Size([1, 1, 24255, 24255]) Position ids shape: torch.Size([1, 24255]) Input IDs shape: torch.Size([1, 24255]) Labels shape: torch.Size([1, 24255]) Final batch size: 1, sequence length: 20432 Attention mask shape: torch.Size([1, 1, 20432, 20432]) Position ids shape: torch.Size([1, 20432]) Input IDs shape: torch.Size([1, 20432]) Labels shape: torch.Size([1, 20432]) Final batch size: 1, sequence length: 14892 Attention mask shape: torch.Size([1, 1, 14892, 14892]) Position ids shape: torch.Size([1, 14892]) Input IDs shape: torch.Size([1, 14892]) Labels shape: torch.Size([1, 14892]) Final batch size: 1, sequence length: 24769 Attention mask shape: torch.Size([1, 1, 24769, 24769]) Position ids shape: torch.Size([1, 24769]) Input IDs shape: torch.Size([1, 24769]) Labels shape: torch.Size([1, 24769]) Final batch size: 1, sequence length: 21664 Attention mask shape: torch.Size([1, 1, 21664, 21664]) Position ids shape: torch.Size([1, 21664]) Input IDs shape: torch.Size([1, 21664]) Labels shape: torch.Size([1, 21664]) Final batch size: 1, sequence length: 19187 Attention mask shape: torch.Size([1, 1, 19187, 19187]) Position ids shape: torch.Size([1, 19187]) Input IDs shape: torch.Size([1, 19187]) Labels shape: torch.Size([1, 19187]) Final batch size: 1, sequence length: 24002 Attention mask shape: torch.Size([1, 1, 24002, 24002]) Position ids shape: torch.Size([1, 24002]) Input IDs shape: torch.Size([1, 24002]) Labels shape: torch.Size([1, 24002]) Final batch size: 1, sequence length: 12012 Attention mask shape: torch.Size([1, 1, 12012, 12012]) Position ids shape: torch.Size([1, 12012]) Input IDs shape: torch.Size([1, 12012]) Labels shape: torch.Size([1, 12012]) Final batch size: 1, sequence length: 22857 Attention mask shape: torch.Size([1, 1, 22857, 22857]) Position ids shape: torch.Size([1, 22857]) Input IDs shape: torch.Size([1, 22857]) Labels shape: torch.Size([1, 22857]) Final batch size: 1, sequence length: 14556 Attention mask shape: torch.Size([1, 1, 14556, 14556]) Position ids shape: torch.Size([1, 14556]) Input IDs shape: torch.Size([1, 14556]) Labels shape: torch.Size([1, 14556]) Final batch size: 1, sequence length: 24428 Attention mask shape: torch.Size([1, 1, 24428, 24428]) Position ids shape: torch.Size([1, 24428]) Input IDs shape: torch.Size([1, 24428]) Labels shape: torch.Size([1, 24428]) Final batch size: 1, sequence length: 22133 Attention mask shape: torch.Size([1, 1, 22133, 22133]) Position ids shape: torch.Size([1, 22133]) Input IDs shape: torch.Size([1, 22133]) Labels shape: torch.Size([1, 22133]) Final batch size: 1, sequence length: 24858 Attention mask shape: torch.Size([1, 1, 24858, 24858]) Position ids shape: torch.Size([1, 24858]) Input IDs shape: torch.Size([1, 24858]) Labels shape: torch.Size([1, 24858]) Final batch size: 1, sequence length: 20562 Attention mask shape: torch.Size([1, 1, 20562, 20562]) Position ids shape: torch.Size([1, 20562]) Input IDs shape: torch.Size([1, 20562]) Labels shape: torch.Size([1, 20562]) Final batch size: 1, sequence length: 26033 Attention mask shape: torch.Size([1, 1, 26033, 26033]) Position ids shape: torch.Size([1, 26033]) Input IDs shape: torch.Size([1, 26033]) Labels shape: torch.Size([1, 26033]) Final batch size: 1, sequence length: 26312 Attention mask shape: torch.Size([1, 1, 26312, 26312]) Position ids shape: torch.Size([1, 26312]) Input IDs shape: torch.Size([1, 26312]) Labels shape: torch.Size([1, 26312]) Final batch size: 1, sequence length: 26144 Attention mask shape: torch.Size([1, 1, 26144, 26144]) Position ids shape: torch.Size([1, 26144]) Input IDs shape: torch.Size([1, 26144]) Labels shape: torch.Size([1, 26144]) Final batch size: 1, sequence length: 16541 Attention mask shape: torch.Size([1, 1, 16541, 16541]) Position ids shape: torch.Size([1, 16541]) Input IDs shape: torch.Size([1, 16541]) Labels shape: torch.Size([1, 16541]) Final batch size: 1, sequence length: 30079 Attention mask shape: torch.Size([1, 1, 30079, 30079]) Position ids shape: torch.Size([1, 30079]) Input IDs shape: torch.Size([1, 30079]) Labels shape: torch.Size([1, 30079]) Final batch size: 1, sequence length: 25707 Attention mask shape: torch.Size([1, 1, 25707, 25707]) Position ids shape: torch.Size([1, 25707]) Input IDs shape: torch.Size([1, 25707]) Labels shape: torch.Size([1, 25707]) Final batch size: 1, sequence length: 25758 Attention mask shape: torch.Size([1, 1, 25758, 25758]) Position ids shape: torch.Size([1, 25758]) Input IDs shape: torch.Size([1, 25758]) Labels shape: torch.Size([1, 25758]) Final batch size: 1, sequence length: 23602 Attention mask shape: torch.Size([1, 1, 23602, 23602]) Position ids shape: torch.Size([1, 23602]) Input IDs shape: torch.Size([1, 23602]) Labels shape: torch.Size([1, 23602]) Final batch size: 1, sequence length: 25886 Attention mask shape: torch.Size([1, 1, 25886, 25886]) Position ids shape: torch.Size([1, 25886]) Input IDs shape: torch.Size([1, 25886]) Labels shape: torch.Size([1, 25886]) Final batch size: 1, sequence length: 10318 Attention mask shape: torch.Size([1, 1, 10318, 10318]) Position ids shape: torch.Size([1, 10318]) Input IDs shape: torch.Size([1, 10318]) Labels shape: torch.Size([1, 10318]) Final batch size: 1, sequence length: 23698 Attention mask shape: torch.Size([1, 1, 23698, 23698]) Position ids shape: torch.Size([1, 23698]) Input IDs shape: torch.Size([1, 23698]) Labels shape: torch.Size([1, 23698]) Final batch size: 1, sequence length: 29825 Attention mask shape: torch.Size([1, 1, 29825, 29825]) Position ids shape: torch.Size([1, 29825]) Input IDs shape: torch.Size([1, 29825]) Labels shape: torch.Size([1, 29825]) Final batch size: 1, sequence length: 29749 Attention mask shape: torch.Size([1, 1, 29749, 29749]) Position ids shape: torch.Size([1, 29749]) Input IDs shape: torch.Size([1, 29749]) Labels shape: torch.Size([1, 29749]) Final batch size: 1, sequence length: 21374 Attention mask shape: torch.Size([1, 1, 21374, 21374]) Position ids shape: torch.Size([1, 21374]) Input IDs shape: torch.Size([1, 21374]) Labels shape: torch.Size([1, 21374]) Final batch size: 1, sequence length: 25719 Attention mask shape: torch.Size([1, 1, 25719, 25719]) Position ids shape: torch.Size([1, 25719]) Input IDs shape: torch.Size([1, 25719]) Labels shape: torch.Size([1, 25719]) Final batch size: 1, sequence length: 25656 Attention mask shape: torch.Size([1, 1, 25656, 25656]) Position ids shape: torch.Size([1, 25656]) Input IDs shape: torch.Size([1, 25656]) Labels shape: torch.Size([1, 25656]) Final batch size: 1, sequence length: 29948 Attention mask shape: torch.Size([1, 1, 29948, 29948]) Position ids shape: torch.Size([1, 29948]) Input IDs shape: torch.Size([1, 29948]) Labels shape: torch.Size([1, 29948]) Final batch size: 1, sequence length: 33611 Attention mask shape: torch.Size([1, 1, 33611, 33611]) Position ids shape: torch.Size([1, 33611]) Input IDs shape: torch.Size([1, 33611]) Labels shape: torch.Size([1, 33611]) Final batch size: 1, sequence length: 32071 Attention mask shape: torch.Size([1, 1, 32071, 32071]) Position ids shape: torch.Size([1, 32071]) Input IDs shape: torch.Size([1, 32071]) Labels shape: torch.Size([1, 32071]) Final batch size: 1, sequence length: 32541 Attention mask shape: torch.Size([1, 1, 32541, 32541]) Position ids shape: torch.Size([1, 32541]) Input IDs shape: torch.Size([1, 32541]) Labels shape: torch.Size([1, 32541]) Final batch size: 1, sequence length: 23399 Attention mask shape: torch.Size([1, 1, 23399, 23399]) Position ids shape: torch.Size([1, 23399]) Input IDs shape: torch.Size([1, 23399]) Labels shape: torch.Size([1, 23399]) Final batch size: 1, sequence length: 9897 Attention mask shape: torch.Size([1, 1, 9897, 9897]) Position ids shape: torch.Size([1, 9897]) Input IDs shape: torch.Size([1, 9897]) Labels shape: torch.Size([1, 9897]) Final batch size: 1, sequence length: 25492 Attention mask shape: torch.Size([1, 1, 25492, 25492]) Position ids shape: torch.Size([1, 25492]) Input IDs shape: torch.Size([1, 25492]) Labels shape: torch.Size([1, 25492]) Final batch size: 1, sequence length: 22139 Attention mask shape: torch.Size([1, 1, 22139, 22139]) Position ids shape: torch.Size([1, 22139]) Input IDs shape: torch.Size([1, 22139]) Labels shape: torch.Size([1, 22139]) Final batch size: 1, sequence length: 23343 Attention mask shape: torch.Size([1, 1, 23343, 23343]) Position ids shape: torch.Size([1, 23343]) Input IDs shape: torch.Size([1, 23343]) Labels shape: torch.Size([1, 23343]) Final batch size: 1, sequence length: 34186 Attention mask shape: torch.Size([1, 1, 34186, 34186]) Position ids shape: torch.Size([1, 34186]) Input IDs shape: torch.Size([1, 34186]) Labels shape: torch.Size([1, 34186]) Final batch size: 1, sequence length: 37241 Attention mask shape: torch.Size([1, 1, 37241, 37241]) Position ids shape: torch.Size([1, 37241]) Input IDs shape: torch.Size([1, 37241]) Labels shape: torch.Size([1, 37241]) Final batch size: 1, sequence length: 34701 Attention mask shape: torch.Size([1, 1, 34701, 34701]) Position ids shape: torch.Size([1, 34701]) Input IDs shape: torch.Size([1, 34701]) Labels shape: torch.Size([1, 34701]) Final batch size: 1, sequence length: 19238 Attention mask shape: torch.Size([1, 1, 19238, 19238]) Position ids shape: torch.Size([1, 19238]) Input IDs shape: torch.Size([1, 19238]) Labels shape: torch.Size([1, 19238]) Final batch size: 1, sequence length: 15604 Attention mask shape: torch.Size([1, 1, 15604, 15604]) Position ids shape: torch.Size([1, 15604]) Input IDs shape: torch.Size([1, 15604]) Labels shape: torch.Size([1, 15604]) Final batch size: 1, sequence length: 37728 Attention mask shape: torch.Size([1, 1, 37728, 37728]) Position ids shape: torch.Size([1, 37728]) Input IDs shape: torch.Size([1, 37728]) Labels shape: torch.Size([1, 37728]) Final batch size: 1, sequence length: 33459 Attention mask shape: torch.Size([1, 1, 33459, 33459]) Position ids shape: torch.Size([1, 33459]) Input IDs shape: torch.Size([1, 33459]) Labels shape: torch.Size([1, 33459]) Final batch size: 1, sequence length: 40593 Attention mask shape: torch.Size([1, 1, 40593, 40593]) Position ids shape: torch.Size([1, 40593]) Input IDs shape: torch.Size([1, 40593]) Labels shape: torch.Size([1, 40593]) Final batch size: 1, sequence length: 21557 Attention mask shape: torch.Size([1, 1, 21557, 21557]) Position ids shape: torch.Size([1, 21557]) Input IDs shape: torch.Size([1, 21557]) Labels shape: torch.Size([1, 21557]) Final batch size: 1, sequence length: 36456 Attention mask shape: torch.Size([1, 1, 36456, 36456]) Position ids shape: torch.Size([1, 36456]) Input IDs shape: torch.Size([1, 36456]) Labels shape: torch.Size([1, 36456]) Final batch size: 1, sequence length: 36778 Attention mask shape: torch.Size([1, 1, 36778, 36778]) Position ids shape: torch.Size([1, 36778]) Input IDs shape: torch.Size([1, 36778]) Labels shape: torch.Size([1, 36778]) Final batch size: 1, sequence length: 31650 Attention mask shape: torch.Size([1, 1, 31650, 31650]) Position ids shape: torch.Size([1, 31650]) Input IDs shape: torch.Size([1, 31650]) Labels shape: torch.Size([1, 31650]) Final batch size: 1, sequence length: 39760 Attention mask shape: torch.Size([1, 1, 39760, 39760]) Position ids shape: torch.Size([1, 39760]) Input IDs shape: torch.Size([1, 39760]) Labels shape: torch.Size([1, 39760]) Final batch size: 1, sequence length: 15294 Attention mask shape: torch.Size([1, 1, 15294, 15294]) Position ids shape: torch.Size([1, 15294]) Input IDs shape: torch.Size([1, 15294]) Labels shape: torch.Size([1, 15294]) Final batch size: 1, sequence length: 31323 Attention mask shape: torch.Size([1, 1, 31323, 31323]) Position ids shape: torch.Size([1, 31323]) Input IDs shape: torch.Size([1, 31323]) Labels shape: torch.Size([1, 31323]) Final batch size: 1, sequence length: 21955 Attention mask shape: torch.Size([1, 1, 21955, 21955]) Position ids shape: torch.Size([1, 21955]) Input IDs shape: torch.Size([1, 21955]) Labels shape: torch.Size([1, 21955]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 22491 Attention mask shape: torch.Size([1, 1, 22491, 22491]) Position ids shape: torch.Size([1, 22491]) Input IDs shape: torch.Size([1, 22491]) Labels shape: torch.Size([1, 22491]) Final batch size: 1, sequence length: 9445 Attention mask shape: torch.Size([1, 1, 9445, 9445]) Position ids shape: torch.Size([1, 9445]) Input IDs shape: torch.Size([1, 9445]) Labels shape: torch.Size([1, 9445]) Final batch size: 1, sequence length: 35760 Attention mask shape: torch.Size([1, 1, 35760, 35760]) Position ids shape: torch.Size([1, 35760]) Input IDs shape: torch.Size([1, 35760]) Labels shape: torch.Size([1, 35760]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 26664 Attention mask shape: torch.Size([1, 1, 26664, 26664]) Position ids shape: torch.Size([1, 26664]) Input IDs shape: torch.Size([1, 26664]) Labels shape: torch.Size([1, 26664]) Final batch size: 1, sequence length: 27283 Attention mask shape: torch.Size([1, 1, 27283, 27283]) Position ids shape: torch.Size([1, 27283]) Input IDs shape: torch.Size([1, 27283]) Labels shape: torch.Size([1, 27283]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 22434 Attention mask shape: torch.Size([1, 1, 22434, 22434]) Position ids shape: torch.Size([1, 22434]) Input IDs shape: torch.Size([1, 22434]) Labels shape: torch.Size([1, 22434]) Final batch size: 1, sequence length: 19867 Attention mask shape: torch.Size([1, 1, 19867, 19867]) Position ids shape: torch.Size([1, 19867]) Input IDs shape: torch.Size([1, 19867]) Labels shape: torch.Size([1, 19867]) Final batch size: 1, sequence length: 37775 Attention mask shape: torch.Size([1, 1, 37775, 37775]) Position ids shape: torch.Size([1, 37775]) Input IDs shape: torch.Size([1, 37775]) Labels shape: torch.Size([1, 37775]) Final batch size: 1, sequence length: 17646 Attention mask shape: torch.Size([1, 1, 17646, 17646]) Position ids shape: torch.Size([1, 17646]) Input IDs shape: torch.Size([1, 17646]) Labels shape: torch.Size([1, 17646]) Final batch size: 1, sequence length: 40507 Attention mask shape: torch.Size([1, 1, 40507, 40507]) Position ids shape: torch.Size([1, 40507]) Input IDs shape: torch.Size([1, 40507]) Labels shape: torch.Size([1, 40507]) Final batch size: 1, sequence length: 40147 Attention mask shape: torch.Size([1, 1, 40147, 40147]) Position ids shape: torch.Size([1, 40147]) Input IDs shape: torch.Size([1, 40147]) Labels shape: torch.Size([1, 40147]) Final batch size: 1, sequence length: 7681 Attention mask shape: torch.Size([1, 1, 7681, 7681]) Position ids shape: torch.Size([1, 7681]) Input IDs shape: torch.Size([1, 7681]) Labels shape: torch.Size([1, 7681]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 29082 Attention mask shape: torch.Size([1, 1, 29082, 29082]) Position ids shape: torch.Size([1, 29082]) Input IDs shape: torch.Size([1, 29082]) Labels shape: torch.Size([1, 29082]) Final batch size: 1, sequence length: 37183 Attention mask shape: torch.Size([1, 1, 37183, 37183]) Position ids shape: torch.Size([1, 37183]) Input IDs shape: torch.Size([1, 37183]) Labels shape: torch.Size([1, 37183]) Final batch size: 1, sequence length: 23014 Attention mask shape: torch.Size([1, 1, 23014, 23014]) Position ids shape: torch.Size([1, 23014]) Input IDs shape: torch.Size([1, 23014]) Labels shape: torch.Size([1, 23014]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 16273 Attention mask shape: torch.Size([1, 1, 16273, 16273]) Position ids shape: torch.Size([1, 16273]) Input IDs shape: torch.Size([1, 16273]) Labels shape: torch.Size([1, 16273]) Final batch size: 1, sequence length: 21489 Attention mask shape: torch.Size([1, 1, 21489, 21489]) Position ids shape: torch.Size([1, 21489]) Input IDs shape: torch.Size([1, 21489]) Labels shape: torch.Size([1, 21489]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 36579 Attention mask shape: torch.Size([1, 1, 36579, 36579]) Position ids shape: torch.Size([1, 36579]) Input IDs shape: torch.Size([1, 36579]) Labels shape: torch.Size([1, 36579]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 37091 Attention mask shape: torch.Size([1, 1, 37091, 37091]) Position ids shape: torch.Size([1, 37091]) Input IDs shape: torch.Size([1, 37091]) Labels shape: torch.Size([1, 37091]) Final batch size: 1, sequence length: 30364 Attention mask shape: torch.Size([1, 1, 30364, 30364]) Position ids shape: torch.Size([1, 30364]) Input IDs shape: torch.Size([1, 30364]) Labels shape: torch.Size([1, 30364]) Final batch size: 1, sequence length: 38686 Attention mask shape: torch.Size([1, 1, 38686, 38686]) Position ids shape: torch.Size([1, 38686]) Input IDs shape: torch.Size([1, 38686]) Labels shape: torch.Size([1, 38686]) Final batch size: 1, sequence length: 33263 Attention mask shape: torch.Size([1, 1, 33263, 33263]) Position ids shape: torch.Size([1, 33263]) Input IDs shape: torch.Size([1, 33263]) Labels shape: torch.Size([1, 33263]) Final batch size: 1, sequence length: 25914 Attention mask shape: torch.Size([1, 1, 25914, 25914]) Position ids shape: torch.Size([1, 25914]) Input IDs shape: torch.Size([1, 25914]) Labels shape: torch.Size([1, 25914]) Final batch size: 1, sequence length: 28661 Attention mask shape: torch.Size([1, 1, 28661, 28661]) Position ids shape: torch.Size([1, 28661]) Input IDs shape: torch.Size([1, 28661]) Labels shape: torch.Size([1, 28661]) Final batch size: 1, sequence length: 36297 Attention mask shape: torch.Size([1, 1, 36297, 36297]) Position ids shape: torch.Size([1, 36297]) Input IDs shape: torch.Size([1, 36297]) Labels shape: torch.Size([1, 36297]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) {'loss': 0.2814, 'grad_norm': 0.3005252982311504, 'learning_rate': 6.7918397477265e-06, 'num_tokens': -inf, 'epoch': 3.5} Final batch size: 1, sequence length: 6519 Attention mask shape: torch.Size([1, 1, 6519, 6519]) Position ids shape: torch.Size([1, 6519]) Input IDs shape: torch.Size([1, 6519]) Labels shape: torch.Size([1, 6519]) Final batch size: 1, sequence length: 5525 Attention mask shape: torch.Size([1, 1, 5525, 5525]) Position ids shape: torch.Size([1, 5525]) Input IDs shape: torch.Size([1, 5525]) Labels shape: torch.Size([1, 5525]) Final batch size: 1, sequence length: 10273 Attention mask shape: torch.Size([1, 1, 10273, 10273]) Position ids shape: torch.Size([1, 10273]) Input IDs shape: torch.Size([1, 10273]) Labels shape: torch.Size([1, 10273]) Final batch size: 1, sequence length: 10408 Attention mask shape: torch.Size([1, 1, 10408, 10408]) Position ids shape: torch.Size([1, 10408]) Input IDs shape: torch.Size([1, 10408]) Labels shape: torch.Size([1, 10408]) Final batch size: 1, sequence length: 12281 Attention mask shape: torch.Size([1, 1, 12281, 12281]) Position ids shape: torch.Size([1, 12281]) Input IDs shape: torch.Size([1, 12281]) Labels shape: torch.Size([1, 12281]) Final batch size: 1, sequence length: 13363 Attention mask shape: torch.Size([1, 1, 13363, 13363]) Position ids shape: torch.Size([1, 13363]) Input IDs shape: torch.Size([1, 13363]) Labels shape: torch.Size([1, 13363]) Final batch size: 1, sequence length: 13385 Attention mask shape: torch.Size([1, 1, 13385, 13385]) Position ids shape: torch.Size([1, 13385]) Input IDs shape: torch.Size([1, 13385]) Labels shape: torch.Size([1, 13385]) Final batch size: 1, sequence length: 9181 Attention mask shape: torch.Size([1, 1, 9181, 9181]) Position ids shape: torch.Size([1, 9181]) Input IDs shape: torch.Size([1, 9181]) Labels shape: torch.Size([1, 9181]) Final batch size: 1, sequence length: 13804 Attention mask shape: torch.Size([1, 1, 13804, 13804]) Position ids shape: torch.Size([1, 13804]) Input IDs shape: torch.Size([1, 13804]) Labels shape: torch.Size([1, 13804]) Final batch size: 1, sequence length: 12927 Attention mask shape: torch.Size([1, 1, 12927, 12927]) Position ids shape: torch.Size([1, 12927]) Input IDs shape: torch.Size([1, 12927]) Labels shape: torch.Size([1, 12927]) Final batch size: 1, sequence length: 15518 Attention mask shape: torch.Size([1, 1, 15518, 15518]) Position ids shape: torch.Size([1, 15518]) Input IDs shape: torch.Size([1, 15518]) Labels shape: torch.Size([1, 15518]) Final batch size: 1, sequence length: 10905 Attention mask shape: torch.Size([1, 1, 10905, 10905]) Position ids shape: torch.Size([1, 10905]) Input IDs shape: torch.Size([1, 10905]) Labels shape: torch.Size([1, 10905]) Final batch size: 1, sequence length: 17003 Attention mask shape: torch.Size([1, 1, 17003, 17003]) Position ids shape: torch.Size([1, 17003]) Input IDs shape: torch.Size([1, 17003]) Labels shape: torch.Size([1, 17003]) Final batch size: 1, sequence length: 15244 Attention mask shape: torch.Size([1, 1, 15244, 15244]) Position ids shape: torch.Size([1, 15244]) Input IDs shape: torch.Size([1, 15244]) Labels shape: torch.Size([1, 15244]) Final batch size: 1, sequence length: 15257 Attention mask shape: torch.Size([1, 1, 15257, 15257]) Position ids shape: torch.Size([1, 15257]) Input IDs shape: torch.Size([1, 15257]) Labels shape: torch.Size([1, 15257]) Final batch size: 1, sequence length: 19768 Attention mask shape: torch.Size([1, 1, 19768, 19768]) Position ids shape: torch.Size([1, 19768]) Input IDs shape: torch.Size([1, 19768]) Labels shape: torch.Size([1, 19768]) Final batch size: 1, sequence length: 16520 Attention mask shape: torch.Size([1, 1, 16520, 16520]) Position ids shape: torch.Size([1, 16520]) Input IDs shape: torch.Size([1, 16520]) Labels shape: torch.Size([1, 16520]) Final batch size: 1, sequence length: 8646 Attention mask shape: torch.Size([1, 1, 8646, 8646]) Position ids shape: torch.Size([1, 8646]) Input IDs shape: torch.Size([1, 8646]) Labels shape: torch.Size([1, 8646]) Final batch size: 1, sequence length: 20089 Attention mask shape: torch.Size([1, 1, 20089, 20089]) Position ids shape: torch.Size([1, 20089]) Input IDs shape: torch.Size([1, 20089]) Labels shape: torch.Size([1, 20089]) Final batch size: 1, sequence length: 20106 Attention mask shape: torch.Size([1, 1, 20106, 20106]) Position ids shape: torch.Size([1, 20106]) Input IDs shape: torch.Size([1, 20106]) Labels shape: torch.Size([1, 20106]) Final batch size: 1, sequence length: 19286 Attention mask shape: torch.Size([1, 1, 19286, 19286]) Position ids shape: torch.Size([1, 19286]) Input IDs shape: torch.Size([1, 19286]) Labels shape: torch.Size([1, 19286]) Final batch size: 1, sequence length: 18819 Attention mask shape: torch.Size([1, 1, 18819, 18819]) Position ids shape: torch.Size([1, 18819]) Input IDs shape: torch.Size([1, 18819]) Labels shape: torch.Size([1, 18819]) Final batch size: 1, sequence length: 21660 Attention mask shape: torch.Size([1, 1, 21660, 21660]) Position ids shape: torch.Size([1, 21660]) Input IDs shape: torch.Size([1, 21660]) Labels shape: torch.Size([1, 21660]) Final batch size: 1, sequence length: 19513 Attention mask shape: torch.Size([1, 1, 19513, 19513]) Position ids shape: torch.Size([1, 19513]) Input IDs shape: torch.Size([1, 19513]) Labels shape: torch.Size([1, 19513]) Final batch size: 1, sequence length: 22777 Attention mask shape: torch.Size([1, 1, 22777, 22777]) Position ids shape: torch.Size([1, 22777]) Input IDs shape: torch.Size([1, 22777]) Labels shape: torch.Size([1, 22777]) Final batch size: 1, sequence length: 20979 Attention mask shape: torch.Size([1, 1, 20979, 20979]) Position ids shape: torch.Size([1, 20979]) Input IDs shape: torch.Size([1, 20979]) Labels shape: torch.Size([1, 20979]) Final batch size: 1, sequence length: 18645 Attention mask shape: torch.Size([1, 1, 18645, 18645]) Position ids shape: torch.Size([1, 18645]) Input IDs shape: torch.Size([1, 18645]) Labels shape: torch.Size([1, 18645]) Final batch size: 1, sequence length: 23995 Attention mask shape: torch.Size([1, 1, 23995, 23995]) Position ids shape: torch.Size([1, 23995]) Input IDs shape: torch.Size([1, 23995]) Labels shape: torch.Size([1, 23995]) Final batch size: 1, sequence length: 23942 Attention mask shape: torch.Size([1, 1, 23942, 23942]) Position ids shape: torch.Size([1, 23942]) Input IDs shape: torch.Size([1, 23942]) Labels shape: torch.Size([1, 23942]) Final batch size: 1, sequence length: 22160 Attention mask shape: torch.Size([1, 1, 22160, 22160]) Position ids shape: torch.Size([1, 22160]) Input IDs shape: torch.Size([1, 22160]) Labels shape: torch.Size([1, 22160]) Final batch size: 1, sequence length: 23341 Attention mask shape: torch.Size([1, 1, 23341, 23341]) Position ids shape: torch.Size([1, 23341]) Input IDs shape: torch.Size([1, 23341]) Labels shape: torch.Size([1, 23341]) Final batch size: 1, sequence length: 23724 Attention mask shape: torch.Size([1, 1, 23724, 23724]) Position ids shape: torch.Size([1, 23724]) Input IDs shape: torch.Size([1, 23724]) Labels shape: torch.Size([1, 23724]) Final batch size: 1, sequence length: 5405 Attention mask shape: torch.Size([1, 1, 5405, 5405]) Position ids shape: torch.Size([1, 5405]) Input IDs shape: torch.Size([1, 5405]) Labels shape: torch.Size([1, 5405]) Final batch size: 1, sequence length: 23886 Attention mask shape: torch.Size([1, 1, 23886, 23886]) Position ids shape: torch.Size([1, 23886]) Input IDs shape: torch.Size([1, 23886]) Labels shape: torch.Size([1, 23886]) Final batch size: 1, sequence length: 14429 Attention mask shape: torch.Size([1, 1, 14429, 14429]) Position ids shape: torch.Size([1, 14429]) Input IDs shape: torch.Size([1, 14429]) Labels shape: torch.Size([1, 14429]) Final batch size: 1, sequence length: 22915 Attention mask shape: torch.Size([1, 1, 22915, 22915]) Position ids shape: torch.Size([1, 22915]) Input IDs shape: torch.Size([1, 22915]) Labels shape: torch.Size([1, 22915]) Final batch size: 1, sequence length: 18988 Attention mask shape: torch.Size([1, 1, 18988, 18988]) Position ids shape: torch.Size([1, 18988]) Input IDs shape: torch.Size([1, 18988]) Labels shape: torch.Size([1, 18988]) Final batch size: 1, sequence length: 22311 Attention mask shape: torch.Size([1, 1, 22311, 22311]) Position ids shape: torch.Size([1, 22311]) Input IDs shape: torch.Size([1, 22311]) Labels shape: torch.Size([1, 22311]) Final batch size: 1, sequence length: 22979 Attention mask shape: torch.Size([1, 1, 22979, 22979]) Position ids shape: torch.Size([1, 22979]) Input IDs shape: torch.Size([1, 22979]) Labels shape: torch.Size([1, 22979]) Final batch size: 1, sequence length: 25021 Attention mask shape: torch.Size([1, 1, 25021, 25021]) Position ids shape: torch.Size([1, 25021]) Input IDs shape: torch.Size([1, 25021]) Labels shape: torch.Size([1, 25021]) Final batch size: 1, sequence length: 5288 Attention mask shape: torch.Size([1, 1, 5288, 5288]) Position ids shape: torch.Size([1, 5288]) Input IDs shape: torch.Size([1, 5288]) Labels shape: torch.Size([1, 5288]) Final batch size: 1, sequence length: 25001 Attention mask shape: torch.Size([1, 1, 25001, 25001]) Position ids shape: torch.Size([1, 25001]) Input IDs shape: torch.Size([1, 25001]) Labels shape: torch.Size([1, 25001]) Final batch size: 1, sequence length: 24407 Attention mask shape: torch.Size([1, 1, 24407, 24407]) Position ids shape: torch.Size([1, 24407]) Input IDs shape: torch.Size([1, 24407]) Labels shape: torch.Size([1, 24407]) Final batch size: 1, sequence length: 11266 Attention mask shape: torch.Size([1, 1, 11266, 11266]) Position ids shape: torch.Size([1, 11266]) Input IDs shape: torch.Size([1, 11266]) Labels shape: torch.Size([1, 11266]) Final batch size: 1, sequence length: 26179 Attention mask shape: torch.Size([1, 1, 26179, 26179]) Position ids shape: torch.Size([1, 26179]) Input IDs shape: torch.Size([1, 26179]) Labels shape: torch.Size([1, 26179]) Final batch size: 1, sequence length: 24293 Attention mask shape: torch.Size([1, 1, 24293, 24293]) Position ids shape: torch.Size([1, 24293]) Input IDs shape: torch.Size([1, 24293]) Labels shape: torch.Size([1, 24293]) Final batch size: 1, sequence length: 24287 Attention mask shape: torch.Size([1, 1, 24287, 24287]) Position ids shape: torch.Size([1, 24287]) Input IDs shape: torch.Size([1, 24287]) Labels shape: torch.Size([1, 24287]) Final batch size: 1, sequence length: 13600 Attention mask shape: torch.Size([1, 1, 13600, 13600]) Position ids shape: torch.Size([1, 13600]) Input IDs shape: torch.Size([1, 13600]) Labels shape: torch.Size([1, 13600]) Final batch size: 1, sequence length: 17951 Attention mask shape: torch.Size([1, 1, 17951, 17951]) Position ids shape: torch.Size([1, 17951]) Input IDs shape: torch.Size([1, 17951]) Labels shape: torch.Size([1, 17951]) Final batch size: 1, sequence length: 26054 Attention mask shape: torch.Size([1, 1, 26054, 26054]) Position ids shape: torch.Size([1, 26054]) Input IDs shape: torch.Size([1, 26054]) Labels shape: torch.Size([1, 26054]) Final batch size: 1, sequence length: 26461 Attention mask shape: torch.Size([1, 1, 26461, 26461]) Position ids shape: torch.Size([1, 26461]) Input IDs shape: torch.Size([1, 26461]) Labels shape: torch.Size([1, 26461]) Final batch size: 1, sequence length: 12483 Attention mask shape: torch.Size([1, 1, 12483, 12483]) Position ids shape: torch.Size([1, 12483]) Input IDs shape: torch.Size([1, 12483]) Labels shape: torch.Size([1, 12483]) Final batch size: 1, sequence length: 20941 Attention mask shape: torch.Size([1, 1, 20941, 20941]) Position ids shape: torch.Size([1, 20941]) Input IDs shape: torch.Size([1, 20941]) Labels shape: torch.Size([1, 20941]) Final batch size: 1, sequence length: 29009 Attention mask shape: torch.Size([1, 1, 29009, 29009]) Position ids shape: torch.Size([1, 29009]) Input IDs shape: torch.Size([1, 29009]) Labels shape: torch.Size([1, 29009]) Final batch size: 1, sequence length: 30723 Attention mask shape: torch.Size([1, 1, 30723, 30723]) Position ids shape: torch.Size([1, 30723]) Input IDs shape: torch.Size([1, 30723]) Labels shape: torch.Size([1, 30723]) Final batch size: 1, sequence length: 30236 Attention mask shape: torch.Size([1, 1, 30236, 30236]) Position ids shape: torch.Size([1, 30236]) Input IDs shape: torch.Size([1, 30236]) Labels shape: torch.Size([1, 30236]) Final batch size: 1, sequence length: 28440 Attention mask shape: torch.Size([1, 1, 28440, 28440]) Position ids shape: torch.Size([1, 28440]) Input IDs shape: torch.Size([1, 28440]) Labels shape: torch.Size([1, 28440]) Final batch size: 1, sequence length: 27405 Attention mask shape: torch.Size([1, 1, 27405, 27405]) Position ids shape: torch.Size([1, 27405]) Input IDs shape: torch.Size([1, 27405]) Labels shape: torch.Size([1, 27405]) Final batch size: 1, sequence length: 32287 Attention mask shape: torch.Size([1, 1, 32287, 32287]) Position ids shape: torch.Size([1, 32287]) Input IDs shape: torch.Size([1, 32287]) Labels shape: torch.Size([1, 32287]) Final batch size: 1, sequence length: 26886 Attention mask shape: torch.Size([1, 1, 26886, 26886]) Position ids shape: torch.Size([1, 26886]) Input IDs shape: torch.Size([1, 26886]) Labels shape: torch.Size([1, 26886]) Final batch size: 1, sequence length: 28263 Attention mask shape: torch.Size([1, 1, 28263, 28263]) Position ids shape: torch.Size([1, 28263]) Input IDs shape: torch.Size([1, 28263]) Labels shape: torch.Size([1, 28263]) Final batch size: 1, sequence length: 32660 Attention mask shape: torch.Size([1, 1, 32660, 32660]) Position ids shape: torch.Size([1, 32660]) Input IDs shape: torch.Size([1, 32660]) Labels shape: torch.Size([1, 32660]) Final batch size: 1, sequence length: 15830 Attention mask shape: torch.Size([1, 1, 15830, 15830]) Position ids shape: torch.Size([1, 15830]) Input IDs shape: torch.Size([1, 15830]) Labels shape: torch.Size([1, 15830]) Final batch size: 1, sequence length: 32799 Attention mask shape: torch.Size([1, 1, 32799, 32799]) Position ids shape: torch.Size([1, 32799]) Input IDs shape: torch.Size([1, 32799]) Labels shape: torch.Size([1, 32799]) Final batch size: 1, sequence length: 17649 Attention mask shape: torch.Size([1, 1, 17649, 17649]) Position ids shape: torch.Size([1, 17649]) Input IDs shape: torch.Size([1, 17649]) Labels shape: torch.Size([1, 17649]) Final batch size: 1, sequence length: 29237 Attention mask shape: torch.Size([1, 1, 29237, 29237]) Position ids shape: torch.Size([1, 29237]) Input IDs shape: torch.Size([1, 29237]) Labels shape: torch.Size([1, 29237]) Final batch size: 1, sequence length: 17539 Attention mask shape: torch.Size([1, 1, 17539, 17539]) Position ids shape: torch.Size([1, 17539]) Input IDs shape: torch.Size([1, 17539]) Labels shape: torch.Size([1, 17539]) Final batch size: 1, sequence length: 21472 Attention mask shape: torch.Size([1, 1, 21472, 21472]) Position ids shape: torch.Size([1, 21472]) Input IDs shape: torch.Size([1, 21472]) Labels shape: torch.Size([1, 21472]) Final batch size: 1, sequence length: 32083 Attention mask shape: torch.Size([1, 1, 32083, 32083]) Position ids shape: torch.Size([1, 32083]) Input IDs shape: torch.Size([1, 32083]) Labels shape: torch.Size([1, 32083]) Final batch size: 1, sequence length: 29109 Attention mask shape: torch.Size([1, 1, 29109, 29109]) Position ids shape: torch.Size([1, 29109]) Input IDs shape: torch.Size([1, 29109]) Labels shape: torch.Size([1, 29109]) Final batch size: 1, sequence length: 29152 Attention mask shape: torch.Size([1, 1, 29152, 29152]) Position ids shape: torch.Size([1, 29152]) Input IDs shape: torch.Size([1, 29152]) Labels shape: torch.Size([1, 29152]) Final batch size: 1, sequence length: 34694 Attention mask shape: torch.Size([1, 1, 34694, 34694]) Position ids shape: torch.Size([1, 34694]) Input IDs shape: torch.Size([1, 34694]) Labels shape: torch.Size([1, 34694]) Final batch size: 1, sequence length: 29561 Attention mask shape: torch.Size([1, 1, 29561, 29561]) Position ids shape: torch.Size([1, 29561]) Input IDs shape: torch.Size([1, 29561]) Labels shape: torch.Size([1, 29561]) Final batch size: 1, sequence length: 27987 Attention mask shape: torch.Size([1, 1, 27987, 27987]) Position ids shape: torch.Size([1, 27987]) Input IDs shape: torch.Size([1, 27987]) Labels shape: torch.Size([1, 27987]) Final batch size: 1, sequence length: 31512 Attention mask shape: torch.Size([1, 1, 31512, 31512]) Position ids shape: torch.Size([1, 31512]) Input IDs shape: torch.Size([1, 31512]) Labels shape: torch.Size([1, 31512]) Final batch size: 1, sequence length: 8008 Attention mask shape: torch.Size([1, 1, 8008, 8008]) Position ids shape: torch.Size([1, 8008]) Input IDs shape: torch.Size([1, 8008]) Labels shape: torch.Size([1, 8008]) Final batch size: 1, sequence length: 26452 Attention mask shape: torch.Size([1, 1, 26452, 26452]) Position ids shape: torch.Size([1, 26452]) Input IDs shape: torch.Size([1, 26452]) Labels shape: torch.Size([1, 26452]) Final batch size: 1, sequence length: 35478 Attention mask shape: torch.Size([1, 1, 35478, 35478]) Position ids shape: torch.Size([1, 35478]) Input IDs shape: torch.Size([1, 35478]) Labels shape: torch.Size([1, 35478]) Final batch size: 1, sequence length: 17775 Attention mask shape: torch.Size([1, 1, 17775, 17775]) Position ids shape: torch.Size([1, 17775]) Input IDs shape: torch.Size([1, 17775]) Labels shape: torch.Size([1, 17775]) Final batch size: 1, sequence length: 23945 Attention mask shape: torch.Size([1, 1, 23945, 23945]) Position ids shape: torch.Size([1, 23945]) Input IDs shape: torch.Size([1, 23945]) Labels shape: torch.Size([1, 23945]) Final batch size: 1, sequence length: 15906 Attention mask shape: torch.Size([1, 1, 15906, 15906]) Position ids shape: torch.Size([1, 15906]) Input IDs shape: torch.Size([1, 15906]) Labels shape: torch.Size([1, 15906]) Final batch size: 1, sequence length: 35696 Attention mask shape: torch.Size([1, 1, 35696, 35696]) Position ids shape: torch.Size([1, 35696]) Input IDs shape: torch.Size([1, 35696]) Labels shape: torch.Size([1, 35696]) Final batch size: 1, sequence length: 7448 Attention mask shape: torch.Size([1, 1, 7448, 7448]) Position ids shape: torch.Size([1, 7448]) Input IDs shape: torch.Size([1, 7448]) Labels shape: torch.Size([1, 7448]) Final batch size: 1, sequence length: 36314 Attention mask shape: torch.Size([1, 1, 36314, 36314]) Position ids shape: torch.Size([1, 36314]) Input IDs shape: torch.Size([1, 36314]) Labels shape: torch.Size([1, 36314]) Final batch size: 1, sequence length: 24056 Attention mask shape: torch.Size([1, 1, 24056, 24056]) Position ids shape: torch.Size([1, 24056]) Input IDs shape: torch.Size([1, 24056]) Labels shape: torch.Size([1, 24056]) Final batch size: 1, sequence length: 29880 Attention mask shape: torch.Size([1, 1, 29880, 29880]) Position ids shape: torch.Size([1, 29880]) Input IDs shape: torch.Size([1, 29880]) Labels shape: torch.Size([1, 29880]) Final batch size: 1, sequence length: 39476 Attention mask shape: torch.Size([1, 1, 39476, 39476]) Position ids shape: torch.Size([1, 39476]) Input IDs shape: torch.Size([1, 39476]) Labels shape: torch.Size([1, 39476]) Final batch size: 1, sequence length: 16852 Attention mask shape: torch.Size([1, 1, 16852, 16852]) Position ids shape: torch.Size([1, 16852]) Input IDs shape: torch.Size([1, 16852]) Labels shape: torch.Size([1, 16852]) Final batch size: 1, sequence length: 29262 Attention mask shape: torch.Size([1, 1, 29262, 29262]) Position ids shape: torch.Size([1, 29262]) Input IDs shape: torch.Size([1, 29262]) Labels shape: torch.Size([1, 29262]) Final batch size: 1, sequence length: 34777 Attention mask shape: torch.Size([1, 1, 34777, 34777]) Position ids shape: torch.Size([1, 34777]) Input IDs shape: torch.Size([1, 34777]) Labels shape: torch.Size([1, 34777]) Final batch size: 1, sequence length: 16631 Attention mask shape: torch.Size([1, 1, 16631, 16631]) Position ids shape: torch.Size([1, 16631]) Input IDs shape: torch.Size([1, 16631]) Labels shape: torch.Size([1, 16631]) Final batch size: 1, sequence length: 31786 Attention mask shape: torch.Size([1, 1, 31786, 31786]) Position ids shape: torch.Size([1, 31786]) Input IDs shape: torch.Size([1, 31786]) Labels shape: torch.Size([1, 31786]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 37933 Attention mask shape: torch.Size([1, 1, 37933, 37933]) Position ids shape: torch.Size([1, 37933]) Input IDs shape: torch.Size([1, 37933]) Labels shape: torch.Size([1, 37933]) Final batch size: 1, sequence length: 28879 Attention mask shape: torch.Size([1, 1, 28879, 28879]) Position ids shape: torch.Size([1, 28879]) Input IDs shape: torch.Size([1, 28879]) Labels shape: torch.Size([1, 28879]) Final batch size: 1, sequence length: 10480 Attention mask shape: torch.Size([1, 1, 10480, 10480]) Position ids shape: torch.Size([1, 10480]) Input IDs shape: torch.Size([1, 10480]) Labels shape: torch.Size([1, 10480]) Final batch size: 1, sequence length: 28691 Attention mask shape: torch.Size([1, 1, 28691, 28691]) Position ids shape: torch.Size([1, 28691]) Input IDs shape: torch.Size([1, 28691]) Labels shape: torch.Size([1, 28691]) Final batch size: 1, sequence length: 35526 Attention mask shape: torch.Size([1, 1, 35526, 35526]) Position ids shape: torch.Size([1, 35526]) Input IDs shape: torch.Size([1, 35526]) Labels shape: torch.Size([1, 35526]) Final batch size: 1, sequence length: 11395 Attention mask shape: torch.Size([1, 1, 11395, 11395]) Position ids shape: torch.Size([1, 11395]) Input IDs shape: torch.Size([1, 11395]) Labels shape: torch.Size([1, 11395]) Final batch size: 1, sequence length: 35999 Attention mask shape: torch.Size([1, 1, 35999, 35999]) Position ids shape: torch.Size([1, 35999]) Input IDs shape: torch.Size([1, 35999]) Labels shape: torch.Size([1, 35999]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 31132 Attention mask shape: torch.Size([1, 1, 31132, 31132]) Position ids shape: torch.Size([1, 31132]) Input IDs shape: torch.Size([1, 31132]) Labels shape: torch.Size([1, 31132]) Final batch size: 1, sequence length: 28495 Attention mask shape: torch.Size([1, 1, 28495, 28495]) Position ids shape: torch.Size([1, 28495]) Input IDs shape: torch.Size([1, 28495]) Labels shape: torch.Size([1, 28495]) Final batch size: 1, sequence length: 24824 Attention mask shape: torch.Size([1, 1, 24824, 24824]) Position ids shape: torch.Size([1, 24824]) Input IDs shape: torch.Size([1, 24824]) Labels shape: torch.Size([1, 24824]) Final batch size: 1, sequence length: 20176 Attention mask shape: torch.Size([1, 1, 20176, 20176]) Position ids shape: torch.Size([1, 20176]) Input IDs shape: torch.Size([1, 20176]) Labels shape: torch.Size([1, 20176]) Final batch size: 1, sequence length: 20694 Attention mask shape: torch.Size([1, 1, 20694, 20694]) Position ids shape: torch.Size([1, 20694]) Input IDs shape: torch.Size([1, 20694]) Labels shape: torch.Size([1, 20694]) Final batch size: 1, sequence length: 18127 Attention mask shape: torch.Size([1, 1, 18127, 18127]) Position ids shape: torch.Size([1, 18127]) Input IDs shape: torch.Size([1, 18127]) Labels shape: torch.Size([1, 18127]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 6378 Attention mask shape: torch.Size([1, 1, 6378, 6378]) Position ids shape: torch.Size([1, 6378]) Input IDs shape: torch.Size([1, 6378]) Labels shape: torch.Size([1, 6378]) Final batch size: 1, sequence length: 38935 Attention mask shape: torch.Size([1, 1, 38935, 38935]) Position ids shape: torch.Size([1, 38935]) Input IDs shape: torch.Size([1, 38935]) Labels shape: torch.Size([1, 38935]) Final batch size: 1, sequence length: 18911 Attention mask shape: torch.Size([1, 1, 18911, 18911]) Position ids shape: torch.Size([1, 18911]) Input IDs shape: torch.Size([1, 18911]) Labels shape: torch.Size([1, 18911]) Final batch size: 1, sequence length: 14869 Attention mask shape: torch.Size([1, 1, 14869, 14869]) Position ids shape: torch.Size([1, 14869]) Input IDs shape: torch.Size([1, 14869]) Labels shape: torch.Size([1, 14869]) Final batch size: 1, sequence length: 40605 Attention mask shape: torch.Size([1, 1, 40605, 40605]) Position ids shape: torch.Size([1, 40605]) Input IDs shape: torch.Size([1, 40605]) Labels shape: torch.Size([1, 40605]) Final batch size: 1, sequence length: 36469 Attention mask shape: torch.Size([1, 1, 36469, 36469]) Position ids shape: torch.Size([1, 36469]) Input IDs shape: torch.Size([1, 36469]) Labels shape: torch.Size([1, 36469]) Final batch size: 1, sequence length: 35116 Attention mask shape: torch.Size([1, 1, 35116, 35116]) Position ids shape: torch.Size([1, 35116]) Input IDs shape: torch.Size([1, 35116]) Labels shape: torch.Size([1, 35116]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 28313 Attention mask shape: torch.Size([1, 1, 28313, 28313]) Position ids shape: torch.Size([1, 28313]) Input IDs shape: torch.Size([1, 28313]) Labels shape: torch.Size([1, 28313]) Final batch size: 1, sequence length: 39324 Attention mask shape: torch.Size([1, 1, 39324, 39324]) Position ids shape: torch.Size([1, 39324]) Input IDs shape: torch.Size([1, 39324]) Labels shape: torch.Size([1, 39324]) Final batch size: 1, sequence length: 39687 Attention mask shape: torch.Size([1, 1, 39687, 39687]) Position ids shape: torch.Size([1, 39687]) Input IDs shape: torch.Size([1, 39687]) Labels shape: torch.Size([1, 39687]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 30024 Attention mask shape: torch.Size([1, 1, 30024, 30024]) Position ids shape: torch.Size([1, 30024]) Input IDs shape: torch.Size([1, 30024]) Labels shape: torch.Size([1, 30024]) Final batch size: 1, sequence length: 37159 Attention mask shape: torch.Size([1, 1, 37159, 37159]) Position ids shape: torch.Size([1, 37159]) Input IDs shape: torch.Size([1, 37159]) Labels shape: torch.Size([1, 37159]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 33070 Attention mask shape: torch.Size([1, 1, 33070, 33070]) Position ids shape: torch.Size([1, 33070]) Input IDs shape: torch.Size([1, 33070]) Labels shape: torch.Size([1, 33070]) Final batch size: 1, sequence length: 26402 Attention mask shape: torch.Size([1, 1, 26402, 26402]) Position ids shape: torch.Size([1, 26402]) Input IDs shape: torch.Size([1, 26402]) Labels shape: torch.Size([1, 26402]) Final batch size: 1, sequence length: 30245 Attention mask shape: torch.Size([1, 1, 30245, 30245]) Position ids shape: torch.Size([1, 30245]) Input IDs shape: torch.Size([1, 30245]) Labels shape: torch.Size([1, 30245]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) {'loss': 0.2628, 'grad_norm': 0.32566194623977623, 'learning_rate': 6.545084971874738e-06, 'num_tokens': -inf, 'epoch': 3.62} Final batch size: 1, sequence length: 3968 Attention mask shape: torch.Size([1, 1, 3968, 3968]) Position ids shape: torch.Size([1, 3968]) Input IDs shape: torch.Size([1, 3968]) Labels shape: torch.Size([1, 3968]) Final batch size: 1, sequence length: 6567 Attention mask shape: torch.Size([1, 1, 6567, 6567]) Position ids shape: torch.Size([1, 6567]) Input IDs shape: torch.Size([1, 6567]) Labels shape: torch.Size([1, 6567]) Final batch size: 1, sequence length: 8347 Attention mask shape: torch.Size([1, 1, 8347, 8347]) Position ids shape: torch.Size([1, 8347]) Input IDs shape: torch.Size([1, 8347]) Labels shape: torch.Size([1, 8347]) Final batch size: 1, sequence length: 5997 Attention mask shape: torch.Size([1, 1, 5997, 5997]) Position ids shape: torch.Size([1, 5997]) Input IDs shape: torch.Size([1, 5997]) Labels shape: torch.Size([1, 5997]) Final batch size: 1, sequence length: 11752 Attention mask shape: torch.Size([1, 1, 11752, 11752]) Position ids shape: torch.Size([1, 11752]) Input IDs shape: torch.Size([1, 11752]) Labels shape: torch.Size([1, 11752]) Final batch size: 1, sequence length: 11054 Attention mask shape: torch.Size([1, 1, 11054, 11054]) Position ids shape: torch.Size([1, 11054]) Input IDs shape: torch.Size([1, 11054]) Labels shape: torch.Size([1, 11054]) Final batch size: 1, sequence length: 12967 Attention mask shape: torch.Size([1, 1, 12967, 12967]) Position ids shape: torch.Size([1, 12967]) Input IDs shape: torch.Size([1, 12967]) Labels shape: torch.Size([1, 12967]) Final batch size: 1, sequence length: 11232 Attention mask shape: torch.Size([1, 1, 11232, 11232]) Position ids shape: torch.Size([1, 11232]) Input IDs shape: torch.Size([1, 11232]) Labels shape: torch.Size([1, 11232]) Final batch size: 1, sequence length: 9027 Attention mask shape: torch.Size([1, 1, 9027, 9027]) Position ids shape: torch.Size([1, 9027]) Input IDs shape: torch.Size([1, 9027]) Labels shape: torch.Size([1, 9027]) Final batch size: 1, sequence length: 13260 Attention mask shape: torch.Size([1, 1, 13260, 13260]) Position ids shape: torch.Size([1, 13260]) Input IDs shape: torch.Size([1, 13260]) Labels shape: torch.Size([1, 13260]) Final batch size: 1, sequence length: 11880 Attention mask shape: torch.Size([1, 1, 11880, 11880]) Position ids shape: torch.Size([1, 11880]) Input IDs shape: torch.Size([1, 11880]) Labels shape: torch.Size([1, 11880]) Final batch size: 1, sequence length: 12370 Attention mask shape: torch.Size([1, 1, 12370, 12370]) Position ids shape: torch.Size([1, 12370]) Input IDs shape: torch.Size([1, 12370]) Labels shape: torch.Size([1, 12370]) Final batch size: 1, sequence length: 14512 Attention mask shape: torch.Size([1, 1, 14512, 14512]) Position ids shape: torch.Size([1, 14512]) Input IDs shape: torch.Size([1, 14512]) Labels shape: torch.Size([1, 14512]) Final batch size: 1, sequence length: 14758 Attention mask shape: torch.Size([1, 1, 14758, 14758]) Position ids shape: torch.Size([1, 14758]) Input IDs shape: torch.Size([1, 14758]) Labels shape: torch.Size([1, 14758]) Final batch size: 1, sequence length: 14597 Attention mask shape: torch.Size([1, 1, 14597, 14597]) Position ids shape: torch.Size([1, 14597]) Input IDs shape: torch.Size([1, 14597]) Labels shape: torch.Size([1, 14597]) Final batch size: 1, sequence length: 17166 Attention mask shape: torch.Size([1, 1, 17166, 17166]) Position ids shape: torch.Size([1, 17166]) Input IDs shape: torch.Size([1, 17166]) Labels shape: torch.Size([1, 17166]) Final batch size: 1, sequence length: 15438 Attention mask shape: torch.Size([1, 1, 15438, 15438]) Position ids shape: torch.Size([1, 15438]) Input IDs shape: torch.Size([1, 15438]) Labels shape: torch.Size([1, 15438]) Final batch size: 1, sequence length: 18050 Attention mask shape: torch.Size([1, 1, 18050, 18050]) Position ids shape: torch.Size([1, 18050]) Input IDs shape: torch.Size([1, 18050]) Labels shape: torch.Size([1, 18050]) Final batch size: 1, sequence length: 17839 Attention mask shape: torch.Size([1, 1, 17839, 17839]) Position ids shape: torch.Size([1, 17839]) Input IDs shape: torch.Size([1, 17839]) Labels shape: torch.Size([1, 17839]) Final batch size: 1, sequence length: 16014 Attention mask shape: torch.Size([1, 1, 16014, 16014]) Position ids shape: torch.Size([1, 16014]) Input IDs shape: torch.Size([1, 16014]) Labels shape: torch.Size([1, 16014]) Final batch size: 1, sequence length: 18264 Attention mask shape: torch.Size([1, 1, 18264, 18264]) Position ids shape: torch.Size([1, 18264]) Input IDs shape: torch.Size([1, 18264]) Labels shape: torch.Size([1, 18264]) Final batch size: 1, sequence length: 17092 Attention mask shape: torch.Size([1, 1, 17092, 17092]) Position ids shape: torch.Size([1, 17092]) Input IDs shape: torch.Size([1, 17092]) Labels shape: torch.Size([1, 17092]) Final batch size: 1, sequence length: 11206 Attention mask shape: torch.Size([1, 1, 11206, 11206]) Position ids shape: torch.Size([1, 11206]) Input IDs shape: torch.Size([1, 11206]) Labels shape: torch.Size([1, 11206]) Final batch size: 1, sequence length: 14520 Attention mask shape: torch.Size([1, 1, 14520, 14520]) Position ids shape: torch.Size([1, 14520]) Input IDs shape: torch.Size([1, 14520]) Labels shape: torch.Size([1, 14520]) Final batch size: 1, sequence length: 17194 Attention mask shape: torch.Size([1, 1, 17194, 17194]) Position ids shape: torch.Size([1, 17194]) Input IDs shape: torch.Size([1, 17194]) Labels shape: torch.Size([1, 17194]) Final batch size: 1, sequence length: 18131 Attention mask shape: torch.Size([1, 1, 18131, 18131]) Position ids shape: torch.Size([1, 18131]) Input IDs shape: torch.Size([1, 18131]) Labels shape: torch.Size([1, 18131]) Final batch size: 1, sequence length: 20487 Attention mask shape: torch.Size([1, 1, 20487, 20487]) Position ids shape: torch.Size([1, 20487]) Input IDs shape: torch.Size([1, 20487]) Labels shape: torch.Size([1, 20487]) Final batch size: 1, sequence length: 14335 Attention mask shape: torch.Size([1, 1, 14335, 14335]) Position ids shape: torch.Size([1, 14335]) Input IDs shape: torch.Size([1, 14335]) Labels shape: torch.Size([1, 14335]) Final batch size: 1, sequence length: 18950 Attention mask shape: torch.Size([1, 1, 18950, 18950]) Position ids shape: torch.Size([1, 18950]) Input IDs shape: torch.Size([1, 18950]) Labels shape: torch.Size([1, 18950]) Final batch size: 1, sequence length: 13061 Attention mask shape: torch.Size([1, 1, 13061, 13061]) Position ids shape: torch.Size([1, 13061]) Input IDs shape: torch.Size([1, 13061]) Labels shape: torch.Size([1, 13061]) Final batch size: 1, sequence length: 24432 Attention mask shape: torch.Size([1, 1, 24432, 24432]) Position ids shape: torch.Size([1, 24432]) Input IDs shape: torch.Size([1, 24432]) Labels shape: torch.Size([1, 24432]) Final batch size: 1, sequence length: 22107 Attention mask shape: torch.Size([1, 1, 22107, 22107]) Position ids shape: torch.Size([1, 22107]) Input IDs shape: torch.Size([1, 22107]) Labels shape: torch.Size([1, 22107]) Final batch size: 1, sequence length: 23324 Attention mask shape: torch.Size([1, 1, 23324, 23324]) Position ids shape: torch.Size([1, 23324]) Input IDs shape: torch.Size([1, 23324]) Labels shape: torch.Size([1, 23324]) Final batch size: 1, sequence length: 24566 Attention mask shape: torch.Size([1, 1, 24566, 24566]) Position ids shape: torch.Size([1, 24566]) Input IDs shape: torch.Size([1, 24566]) Labels shape: torch.Size([1, 24566]) Final batch size: 1, sequence length: 21137 Attention mask shape: torch.Size([1, 1, 21137, 21137]) Position ids shape: torch.Size([1, 21137]) Input IDs shape: torch.Size([1, 21137]) Labels shape: torch.Size([1, 21137]) Final batch size: 1, sequence length: 26068 Attention mask shape: torch.Size([1, 1, 26068, 26068]) Position ids shape: torch.Size([1, 26068]) Input IDs shape: torch.Size([1, 26068]) Labels shape: torch.Size([1, 26068]) Final batch size: 1, sequence length: 20947 Attention mask shape: torch.Size([1, 1, 20947, 20947]) Position ids shape: torch.Size([1, 20947]) Input IDs shape: torch.Size([1, 20947]) Labels shape: torch.Size([1, 20947]) Final batch size: 1, sequence length: 23960 Attention mask shape: torch.Size([1, 1, 23960, 23960]) Position ids shape: torch.Size([1, 23960]) Input IDs shape: torch.Size([1, 23960]) Labels shape: torch.Size([1, 23960]) Final batch size: 1, sequence length: 22547 Attention mask shape: torch.Size([1, 1, 22547, 22547]) Position ids shape: torch.Size([1, 22547]) Input IDs shape: torch.Size([1, 22547]) Labels shape: torch.Size([1, 22547]) Final batch size: 1, sequence length: 17115 Attention mask shape: torch.Size([1, 1, 17115, 17115]) Position ids shape: torch.Size([1, 17115]) Input IDs shape: torch.Size([1, 17115]) Labels shape: torch.Size([1, 17115]) Final batch size: 1, sequence length: 18051 Attention mask shape: torch.Size([1, 1, 18051, 18051]) Position ids shape: torch.Size([1, 18051]) Input IDs shape: torch.Size([1, 18051]) Labels shape: torch.Size([1, 18051]) Final batch size: 1, sequence length: 29236 Attention mask shape: torch.Size([1, 1, 29236, 29236]) Position ids shape: torch.Size([1, 29236]) Input IDs shape: torch.Size([1, 29236]) Labels shape: torch.Size([1, 29236]) Final batch size: 1, sequence length: 28858 Attention mask shape: torch.Size([1, 1, 28858, 28858]) Position ids shape: torch.Size([1, 28858]) Input IDs shape: torch.Size([1, 28858]) Labels shape: torch.Size([1, 28858]) Final batch size: 1, sequence length: 28284 Final batch size: 1, sequence length: 15229 Attention mask shape: torch.Size([1, 1, 28284, 28284]) Position ids shape: torch.Size([1, 28284]) Input IDs shape: torch.Size([1, 28284]) Labels shape: torch.Size([1, 28284]) Attention mask shape: torch.Size([1, 1, 15229, 15229]) Position ids shape: torch.Size([1, 15229]) Input IDs shape: torch.Size([1, 15229]) Labels shape: torch.Size([1, 15229]) Final batch size: 1, sequence length: 26271 Attention mask shape: torch.Size([1, 1, 26271, 26271]) Position ids shape: torch.Size([1, 26271]) Input IDs shape: torch.Size([1, 26271]) Labels shape: torch.Size([1, 26271]) Final batch size: 1, sequence length: 29768 Attention mask shape: torch.Size([1, 1, 29768, 29768]) Position ids shape: torch.Size([1, 29768]) Input IDs shape: torch.Size([1, 29768]) Labels shape: torch.Size([1, 29768]) Final batch size: 1, sequence length: 21728 Attention mask shape: torch.Size([1, 1, 21728, 21728]) Position ids shape: torch.Size([1, 21728]) Input IDs shape: torch.Size([1, 21728]) Labels shape: torch.Size([1, 21728]) Final batch size: 1, sequence length: 29592 Attention mask shape: torch.Size([1, 1, 29592, 29592]) Position ids shape: torch.Size([1, 29592]) Input IDs shape: torch.Size([1, 29592]) Labels shape: torch.Size([1, 29592]) Final batch size: 1, sequence length: 10286 Attention mask shape: torch.Size([1, 1, 10286, 10286]) Position ids shape: torch.Size([1, 10286]) Input IDs shape: torch.Size([1, 10286]) Labels shape: torch.Size([1, 10286]) Final batch size: 1, sequence length: 29478 Attention mask shape: torch.Size([1, 1, 29478, 29478]) Position ids shape: torch.Size([1, 29478]) Input IDs shape: torch.Size([1, 29478]) Labels shape: torch.Size([1, 29478]) Final batch size: 1, sequence length: 26333 Attention mask shape: torch.Size([1, 1, 26333, 26333]) Position ids shape: torch.Size([1, 26333]) Input IDs shape: torch.Size([1, 26333]) Labels shape: torch.Size([1, 26333]) Final batch size: 1, sequence length: 25252 Attention mask shape: torch.Size([1, 1, 25252, 25252]) Position ids shape: torch.Size([1, 25252]) Input IDs shape: torch.Size([1, 25252]) Labels shape: torch.Size([1, 25252]) Final batch size: 1, sequence length: 27278 Attention mask shape: torch.Size([1, 1, 27278, 27278]) Position ids shape: torch.Size([1, 27278]) Input IDs shape: torch.Size([1, 27278]) Labels shape: torch.Size([1, 27278]) Final batch size: 1, sequence length: 27293 Attention mask shape: torch.Size([1, 1, 27293, 27293]) Position ids shape: torch.Size([1, 27293]) Input IDs shape: torch.Size([1, 27293]) Labels shape: torch.Size([1, 27293]) Final batch size: 1, sequence length: 31860 Attention mask shape: torch.Size([1, 1, 31860, 31860]) Position ids shape: torch.Size([1, 31860]) Input IDs shape: torch.Size([1, 31860]) Labels shape: torch.Size([1, 31860]) Final batch size: 1, sequence length: 30862 Attention mask shape: torch.Size([1, 1, 30862, 30862]) Position ids shape: torch.Size([1, 30862]) Input IDs shape: torch.Size([1, 30862]) Labels shape: torch.Size([1, 30862]) Final batch size: 1, sequence length: 26937 Attention mask shape: torch.Size([1, 1, 26937, 26937]) Position ids shape: torch.Size([1, 26937]) Input IDs shape: torch.Size([1, 26937]) Labels shape: torch.Size([1, 26937]) Final batch size: 1, sequence length: 28421 Attention mask shape: torch.Size([1, 1, 28421, 28421]) Position ids shape: torch.Size([1, 28421]) Input IDs shape: torch.Size([1, 28421]) Labels shape: torch.Size([1, 28421]) Final batch size: 1, sequence length: 32392 Attention mask shape: torch.Size([1, 1, 32392, 32392]) Position ids shape: torch.Size([1, 32392]) Input IDs shape: torch.Size([1, 32392]) Labels shape: torch.Size([1, 32392]) Final batch size: 1, sequence length: 32112 Attention mask shape: torch.Size([1, 1, 32112, 32112]) Position ids shape: torch.Size([1, 32112]) Input IDs shape: torch.Size([1, 32112]) Labels shape: torch.Size([1, 32112]) Final batch size: 1, sequence length: 33736 Attention mask shape: torch.Size([1, 1, 33736, 33736]) Position ids shape: torch.Size([1, 33736]) Input IDs shape: torch.Size([1, 33736]) Labels shape: torch.Size([1, 33736]) Final batch size: 1, sequence length: 31377 Attention mask shape: torch.Size([1, 1, 31377, 31377]) Position ids shape: torch.Size([1, 31377]) Input IDs shape: torch.Size([1, 31377]) Labels shape: torch.Size([1, 31377]) Final batch size: 1, sequence length: 13257 Attention mask shape: torch.Size([1, 1, 13257, 13257]) Position ids shape: torch.Size([1, 13257]) Input IDs shape: torch.Size([1, 13257]) Labels shape: torch.Size([1, 13257]) Final batch size: 1, sequence length: 18408 Attention mask shape: torch.Size([1, 1, 18408, 18408]) Position ids shape: torch.Size([1, 18408]) Input IDs shape: torch.Size([1, 18408]) Labels shape: torch.Size([1, 18408]) Final batch size: 1, sequence length: 7364 Attention mask shape: torch.Size([1, 1, 7364, 7364]) Position ids shape: torch.Size([1, 7364]) Input IDs shape: torch.Size([1, 7364]) Labels shape: torch.Size([1, 7364]) Final batch size: 1, sequence length: 19888 Attention mask shape: torch.Size([1, 1, 19888, 19888]) Position ids shape: torch.Size([1, 19888]) Input IDs shape: torch.Size([1, 19888]) Labels shape: torch.Size([1, 19888]) Final batch size: 1, sequence length: 31740 Attention mask shape: torch.Size([1, 1, 31740, 31740]) Position ids shape: torch.Size([1, 31740]) Input IDs shape: torch.Size([1, 31740]) Labels shape: torch.Size([1, 31740]) Final batch size: 1, sequence length: 33097 Attention mask shape: torch.Size([1, 1, 33097, 33097]) Position ids shape: torch.Size([1, 33097]) Input IDs shape: torch.Size([1, 33097]) Labels shape: torch.Size([1, 33097]) Final batch size: 1, sequence length: 29489 Attention mask shape: torch.Size([1, 1, 29489, 29489]) Position ids shape: torch.Size([1, 29489]) Input IDs shape: torch.Size([1, 29489]) Labels shape: torch.Size([1, 29489]) Final batch size: 1, sequence length: 25484 Attention mask shape: torch.Size([1, 1, 25484, 25484]) Position ids shape: torch.Size([1, 25484]) Input IDs shape: torch.Size([1, 25484]) Labels shape: torch.Size([1, 25484]) Final batch size: 1, sequence length: 9974 Attention mask shape: torch.Size([1, 1, 9974, 9974]) Position ids shape: torch.Size([1, 9974]) Input IDs shape: torch.Size([1, 9974]) Labels shape: torch.Size([1, 9974]) Final batch size: 1, sequence length: 32181 Attention mask shape: torch.Size([1, 1, 32181, 32181]) Position ids shape: torch.Size([1, 32181]) Input IDs shape: torch.Size([1, 32181]) Labels shape: torch.Size([1, 32181]) Final batch size: 1, sequence length: 32162 Attention mask shape: torch.Size([1, 1, 32162, 32162]) Position ids shape: torch.Size([1, 32162]) Input IDs shape: torch.Size([1, 32162]) Labels shape: torch.Size([1, 32162]) Final batch size: 1, sequence length: 35596 Attention mask shape: torch.Size([1, 1, 35596, 35596]) Position ids shape: torch.Size([1, 35596]) Input IDs shape: torch.Size([1, 35596]) Labels shape: torch.Size([1, 35596]) Final batch size: 1, sequence length: 37623 Attention mask shape: torch.Size([1, 1, 37623, 37623]) Position ids shape: torch.Size([1, 37623]) Input IDs shape: torch.Size([1, 37623]) Labels shape: torch.Size([1, 37623]) Final batch size: 1, sequence length: 35069 Attention mask shape: torch.Size([1, 1, 35069, 35069]) Position ids shape: torch.Size([1, 35069]) Input IDs shape: torch.Size([1, 35069]) Labels shape: torch.Size([1, 35069]) Final batch size: 1, sequence length: 34514 Attention mask shape: torch.Size([1, 1, 34514, 34514]) Position ids shape: torch.Size([1, 34514]) Input IDs shape: torch.Size([1, 34514]) Labels shape: torch.Size([1, 34514]) Final batch size: 1, sequence length: 35171 Attention mask shape: torch.Size([1, 1, 35171, 35171]) Position ids shape: torch.Size([1, 35171]) Input IDs shape: torch.Size([1, 35171]) Labels shape: torch.Size([1, 35171]) Final batch size: 1, sequence length: 18711 Attention mask shape: torch.Size([1, 1, 18711, 18711]) Position ids shape: torch.Size([1, 18711]) Input IDs shape: torch.Size([1, 18711]) Labels shape: torch.Size([1, 18711]) Final batch size: 1, sequence length: 39754 Attention mask shape: torch.Size([1, 1, 39754, 39754]) Position ids shape: torch.Size([1, 39754]) Input IDs shape: torch.Size([1, 39754]) Labels shape: torch.Size([1, 39754]) Final batch size: 1, sequence length: 21500 Attention mask shape: torch.Size([1, 1, 21500, 21500]) Position ids shape: torch.Size([1, 21500]) Input IDs shape: torch.Size([1, 21500]) Labels shape: torch.Size([1, 21500]) Final batch size: 1, sequence length: 30302 Attention mask shape: torch.Size([1, 1, 30302, 30302]) Position ids shape: torch.Size([1, 30302]) Input IDs shape: torch.Size([1, 30302]) Labels shape: torch.Size([1, 30302]) Final batch size: 1, sequence length: 17777 Attention mask shape: torch.Size([1, 1, 17777, 17777]) Position ids shape: torch.Size([1, 17777]) Input IDs shape: torch.Size([1, 17777]) Labels shape: torch.Size([1, 17777]) Final batch size: 1, sequence length: 24061 Attention mask shape: torch.Size([1, 1, 24061, 24061]) Position ids shape: torch.Size([1, 24061]) Input IDs shape: torch.Size([1, 24061]) Labels shape: torch.Size([1, 24061]) Final batch size: 1, sequence length: 32979 Attention mask shape: torch.Size([1, 1, 32979, 32979]) Position ids shape: torch.Size([1, 32979]) Input IDs shape: torch.Size([1, 32979]) Labels shape: torch.Size([1, 32979]) Final batch size: 1, sequence length: 25386 Attention mask shape: torch.Size([1, 1, 25386, 25386]) Position ids shape: torch.Size([1, 25386]) Input IDs shape: torch.Size([1, 25386]) Labels shape: torch.Size([1, 25386]) Final batch size: 1, sequence length: 13557 Attention mask shape: torch.Size([1, 1, 13557, 13557]) Position ids shape: torch.Size([1, 13557]) Input IDs shape: torch.Size([1, 13557]) Labels shape: torch.Size([1, 13557]) Final batch size: 1, sequence length: 31679 Attention mask shape: torch.Size([1, 1, 31679, 31679]) Position ids shape: torch.Size([1, 31679]) Input IDs shape: torch.Size([1, 31679]) Labels shape: torch.Size([1, 31679]) Final batch size: 1, sequence length: 38351 Attention mask shape: torch.Size([1, 1, 38351, 38351]) Position ids shape: torch.Size([1, 38351]) Input IDs shape: torch.Size([1, 38351]) Labels shape: torch.Size([1, 38351]) Final batch size: 1, sequence length: 30051 Attention mask shape: torch.Size([1, 1, 30051, 30051]) Position ids shape: torch.Size([1, 30051]) Input IDs shape: torch.Size([1, 30051]) Labels shape: torch.Size([1, 30051]) Final batch size: 1, sequence length: 38790 Attention mask shape: torch.Size([1, 1, 38790, 38790]) Position ids shape: torch.Size([1, 38790]) Input IDs shape: torch.Size([1, 38790]) Labels shape: torch.Size([1, 38790]) Final batch size: 1, sequence length: 34170 Attention mask shape: torch.Size([1, 1, 34170, 34170]) Position ids shape: torch.Size([1, 34170]) Input IDs shape: torch.Size([1, 34170]) Labels shape: torch.Size([1, 34170]) Final batch size: 1, sequence length: 21408 Attention mask shape: torch.Size([1, 1, 21408, 21408]) Position ids shape: torch.Size([1, 21408]) Input IDs shape: torch.Size([1, 21408]) Labels shape: torch.Size([1, 21408]) Final batch size: 1, sequence length: 30639 Attention mask shape: torch.Size([1, 1, 30639, 30639]) Position ids shape: torch.Size([1, 30639]) Input IDs shape: torch.Size([1, 30639]) Labels shape: torch.Size([1, 30639]) Final batch size: 1, sequence length: 36535 Attention mask shape: torch.Size([1, 1, 36535, 36535]) Position ids shape: torch.Size([1, 36535]) Input IDs shape: torch.Size([1, 36535]) Labels shape: torch.Size([1, 36535]) Final batch size: 1, sequence length: 24943 Attention mask shape: torch.Size([1, 1, 24943, 24943]) Position ids shape: torch.Size([1, 24943]) Input IDs shape: torch.Size([1, 24943]) Labels shape: torch.Size([1, 24943]) Final batch size: 1, sequence length: 34871 Attention mask shape: torch.Size([1, 1, 34871, 34871]) Position ids shape: torch.Size([1, 34871]) Input IDs shape: torch.Size([1, 34871]) Labels shape: torch.Size([1, 34871]) Final batch size: 1, sequence length: 32327 Attention mask shape: torch.Size([1, 1, 32327, 32327]) Position ids shape: torch.Size([1, 32327]) Input IDs shape: torch.Size([1, 32327]) Labels shape: torch.Size([1, 32327]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 15697 Attention mask shape: torch.Size([1, 1, 15697, 15697]) Position ids shape: torch.Size([1, 15697]) Input IDs shape: torch.Size([1, 15697]) Labels shape: torch.Size([1, 15697]) Final batch size: 1, sequence length: 21825 Attention mask shape: torch.Size([1, 1, 21825, 21825]) Position ids shape: torch.Size([1, 21825]) Input IDs shape: torch.Size([1, 21825]) Labels shape: torch.Size([1, 21825]) Final batch size: 1, sequence length: 33883 Attention mask shape: torch.Size([1, 1, 33883, 33883]) Position ids shape: torch.Size([1, 33883]) Input IDs shape: torch.Size([1, 33883]) Labels shape: torch.Size([1, 33883]) Final batch size: 1, sequence length: 33807 Attention mask shape: torch.Size([1, 1, 33807, 33807]) Position ids shape: torch.Size([1, 33807]) Input IDs shape: torch.Size([1, 33807]) Labels shape: torch.Size([1, 33807]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 24840 Attention mask shape: torch.Size([1, 1, 24840, 24840]) Position ids shape: torch.Size([1, 24840]) Input IDs shape: torch.Size([1, 24840]) Labels shape: torch.Size([1, 24840]) Final batch size: 1, sequence length: 23243 Attention mask shape: torch.Size([1, 1, 23243, 23243]) Position ids shape: torch.Size([1, 23243]) Input IDs shape: torch.Size([1, 23243]) Labels shape: torch.Size([1, 23243]) Final batch size: 1, sequence length: 18106 Attention mask shape: torch.Size([1, 1, 18106, 18106]) Position ids shape: torch.Size([1, 18106]) Input IDs shape: torch.Size([1, 18106]) Labels shape: torch.Size([1, 18106]) Final batch size: 1, sequence length: 24910 Attention mask shape: torch.Size([1, 1, 24910, 24910]) Position ids shape: torch.Size([1, 24910]) Input IDs shape: torch.Size([1, 24910]) Labels shape: torch.Size([1, 24910]) Final batch size: 1, sequence length: 30361 Attention mask shape: torch.Size([1, 1, 30361, 30361]) Position ids shape: torch.Size([1, 30361]) Input IDs shape: torch.Size([1, 30361]) Labels shape: torch.Size([1, 30361]) Final batch size: 1, sequence length: 31513 Attention mask shape: torch.Size([1, 1, 31513, 31513]) Position ids shape: torch.Size([1, 31513]) Input IDs shape: torch.Size([1, 31513]) Labels shape: torch.Size([1, 31513]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 39483 Attention mask shape: torch.Size([1, 1, 39483, 39483]) Position ids shape: torch.Size([1, 39483]) Input IDs shape: torch.Size([1, 39483]) Labels shape: torch.Size([1, 39483]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 29691 Attention mask shape: torch.Size([1, 1, 29691, 29691]) Position ids shape: torch.Size([1, 29691]) Input IDs shape: torch.Size([1, 29691]) Labels shape: torch.Size([1, 29691]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 27228 Attention mask shape: torch.Size([1, 1, 27228, 27228]) Position ids shape: torch.Size([1, 27228]) Input IDs shape: torch.Size([1, 27228]) Labels shape: torch.Size([1, 27228]) Final batch size: 1, sequence length: 32351 Attention mask shape: torch.Size([1, 1, 32351, 32351]) Position ids shape: torch.Size([1, 32351]) Input IDs shape: torch.Size([1, 32351]) Labels shape: torch.Size([1, 32351]) Final batch size: 1, sequence length: 33549 Attention mask shape: torch.Size([1, 1, 33549, 33549]) Position ids shape: torch.Size([1, 33549]) Input IDs shape: torch.Size([1, 33549]) Labels shape: torch.Size([1, 33549]) Final batch size: 1, sequence length: 15180 Attention mask shape: torch.Size([1, 1, 15180, 15180]) Position ids shape: torch.Size([1, 15180]) Input IDs shape: torch.Size([1, 15180]) Labels shape: torch.Size([1, 15180]) Final batch size: 1, sequence length: 13064 Attention mask shape: torch.Size([1, 1, 13064, 13064]) Position ids shape: torch.Size([1, 13064]) Input IDs shape: torch.Size([1, 13064]) Labels shape: torch.Size([1, 13064]) Final batch size: 1, sequence length: 30419 Attention mask shape: torch.Size([1, 1, 30419, 30419]) Position ids shape: torch.Size([1, 30419]) Input IDs shape: torch.Size([1, 30419]) Labels shape: torch.Size([1, 30419]) Final batch size: 1, sequence length: 36532 Attention mask shape: torch.Size([1, 1, 36532, 36532]) Position ids shape: torch.Size([1, 36532]) Input IDs shape: torch.Size([1, 36532]) Labels shape: torch.Size([1, 36532]) {'loss': 0.275, 'grad_norm': 0.3474612334737148, 'learning_rate': 6.294095225512604e-06, 'num_tokens': -inf, 'epoch': 3.75} Final batch size: 1, sequence length: 8845 Attention mask shape: torch.Size([1, 1, 8845, 8845]) Position ids shape: torch.Size([1, 8845]) Input IDs shape: torch.Size([1, 8845]) Labels shape: torch.Size([1, 8845]) Final batch size: 1, sequence length: 7235 Attention mask shape: torch.Size([1, 1, 7235, 7235]) Position ids shape: torch.Size([1, 7235]) Input IDs shape: torch.Size([1, 7235]) Labels shape: torch.Size([1, 7235]) Final batch size: 1, sequence length: 12215 Attention mask shape: torch.Size([1, 1, 12215, 12215]) Position ids shape: torch.Size([1, 12215]) Input IDs shape: torch.Size([1, 12215]) Labels shape: torch.Size([1, 12215]) Final batch size: 1, sequence length: 12830 Attention mask shape: torch.Size([1, 1, 12830, 12830]) Position ids shape: torch.Size([1, 12830]) Input IDs shape: torch.Size([1, 12830]) Labels shape: torch.Size([1, 12830]) Final batch size: 1, sequence length: 13575 Attention mask shape: torch.Size([1, 1, 13575, 13575]) Position ids shape: torch.Size([1, 13575]) Input IDs shape: torch.Size([1, 13575]) Labels shape: torch.Size([1, 13575]) Final batch size: 1, sequence length: 13665 Attention mask shape: torch.Size([1, 1, 13665, 13665]) Position ids shape: torch.Size([1, 13665]) Input IDs shape: torch.Size([1, 13665]) Labels shape: torch.Size([1, 13665]) Final batch size: 1, sequence length: 12562 Attention mask shape: torch.Size([1, 1, 12562, 12562]) Position ids shape: torch.Size([1, 12562]) Input IDs shape: torch.Size([1, 12562]) Labels shape: torch.Size([1, 12562]) Final batch size: 1, sequence length: 12960 Attention mask shape: torch.Size([1, 1, 12960, 12960]) Position ids shape: torch.Size([1, 12960]) Input IDs shape: torch.Size([1, 12960]) Labels shape: torch.Size([1, 12960]) Final batch size: 1, sequence length: 14689 Attention mask shape: torch.Size([1, 1, 14689, 14689]) Position ids shape: torch.Size([1, 14689]) Input IDs shape: torch.Size([1, 14689]) Labels shape: torch.Size([1, 14689]) Final batch size: 1, sequence length: 17016 Attention mask shape: torch.Size([1, 1, 17016, 17016]) Position ids shape: torch.Size([1, 17016]) Input IDs shape: torch.Size([1, 17016]) Labels shape: torch.Size([1, 17016]) Final batch size: 1, sequence length: 16145 Attention mask shape: torch.Size([1, 1, 16145, 16145]) Position ids shape: torch.Size([1, 16145]) Input IDs shape: torch.Size([1, 16145]) Labels shape: torch.Size([1, 16145]) Final batch size: 1, sequence length: 14833 Attention mask shape: torch.Size([1, 1, 14833, 14833]) Position ids shape: torch.Size([1, 14833]) Input IDs shape: torch.Size([1, 14833]) Labels shape: torch.Size([1, 14833]) Final batch size: 1, sequence length: 17026 Attention mask shape: torch.Size([1, 1, 17026, 17026]) Position ids shape: torch.Size([1, 17026]) Input IDs shape: torch.Size([1, 17026]) Labels shape: torch.Size([1, 17026]) Final batch size: 1, sequence length: 14482 Attention mask shape: torch.Size([1, 1, 14482, 14482]) Position ids shape: torch.Size([1, 14482]) Input IDs shape: torch.Size([1, 14482]) Labels shape: torch.Size([1, 14482]) Final batch size: 1, sequence length: 15816 Attention mask shape: torch.Size([1, 1, 15816, 15816]) Position ids shape: torch.Size([1, 15816]) Input IDs shape: torch.Size([1, 15816]) Labels shape: torch.Size([1, 15816]) Final batch size: 1, sequence length: 12622 Attention mask shape: torch.Size([1, 1, 12622, 12622]) Position ids shape: torch.Size([1, 12622]) Input IDs shape: torch.Size([1, 12622]) Labels shape: torch.Size([1, 12622]) Final batch size: 1, sequence length: 18978 Attention mask shape: torch.Size([1, 1, 18978, 18978]) Position ids shape: torch.Size([1, 18978]) Input IDs shape: torch.Size([1, 18978]) Labels shape: torch.Size([1, 18978]) Final batch size: 1, sequence length: 19409 Attention mask shape: torch.Size([1, 1, 19409, 19409]) Position ids shape: torch.Size([1, 19409]) Input IDs shape: torch.Size([1, 19409]) Labels shape: torch.Size([1, 19409]) Final batch size: 1, sequence length: 17512 Attention mask shape: torch.Size([1, 1, 17512, 17512]) Position ids shape: torch.Size([1, 17512]) Input IDs shape: torch.Size([1, 17512]) Labels shape: torch.Size([1, 17512]) Final batch size: 1, sequence length: 15673 Attention mask shape: torch.Size([1, 1, 15673, 15673]) Position ids shape: torch.Size([1, 15673]) Input IDs shape: torch.Size([1, 15673]) Labels shape: torch.Size([1, 15673]) Final batch size: 1, sequence length: 18922 Attention mask shape: torch.Size([1, 1, 18922, 18922]) Position ids shape: torch.Size([1, 18922]) Input IDs shape: torch.Size([1, 18922]) Labels shape: torch.Size([1, 18922]) Final batch size: 1, sequence length: 19225 Attention mask shape: torch.Size([1, 1, 19225, 19225]) Position ids shape: torch.Size([1, 19225]) Input IDs shape: torch.Size([1, 19225]) Labels shape: torch.Size([1, 19225]) Final batch size: 1, sequence length: 17988 Attention mask shape: torch.Size([1, 1, 17988, 17988]) Position ids shape: torch.Size([1, 17988]) Input IDs shape: torch.Size([1, 17988]) Labels shape: torch.Size([1, 17988]) Final batch size: 1, sequence length: 21075 Attention mask shape: torch.Size([1, 1, 21075, 21075]) Position ids shape: torch.Size([1, 21075]) Input IDs shape: torch.Size([1, 21075]) Labels shape: torch.Size([1, 21075]) Final batch size: 1, sequence length: 20389 Attention mask shape: torch.Size([1, 1, 20389, 20389]) Position ids shape: torch.Size([1, 20389]) Input IDs shape: torch.Size([1, 20389]) Labels shape: torch.Size([1, 20389]) Final batch size: 1, sequence length: 20630 Attention mask shape: torch.Size([1, 1, 20630, 20630]) Position ids shape: torch.Size([1, 20630]) Input IDs shape: torch.Size([1, 20630]) Labels shape: torch.Size([1, 20630]) Final batch size: 1, sequence length: 20585 Attention mask shape: torch.Size([1, 1, 20585, 20585]) Position ids shape: torch.Size([1, 20585]) Input IDs shape: torch.Size([1, 20585]) Labels shape: torch.Size([1, 20585]) Final batch size: 1, sequence length: 15222 Attention mask shape: torch.Size([1, 1, 15222, 15222]) Position ids shape: torch.Size([1, 15222]) Input IDs shape: torch.Size([1, 15222]) Labels shape: torch.Size([1, 15222]) Final batch size: 1, sequence length: 10494 Attention mask shape: torch.Size([1, 1, 10494, 10494]) Position ids shape: torch.Size([1, 10494]) Input IDs shape: torch.Size([1, 10494]) Labels shape: torch.Size([1, 10494]) Final batch size: 1, sequence length: 20784 Attention mask shape: torch.Size([1, 1, 20784, 20784]) Position ids shape: torch.Size([1, 20784]) Input IDs shape: torch.Size([1, 20784]) Labels shape: torch.Size([1, 20784]) Final batch size: 1, sequence length: 19325 Attention mask shape: torch.Size([1, 1, 19325, 19325]) Position ids shape: torch.Size([1, 19325]) Input IDs shape: torch.Size([1, 19325]) Labels shape: torch.Size([1, 19325]) Final batch size: 1, sequence length: 6618 Attention mask shape: torch.Size([1, 1, 6618, 6618]) Position ids shape: torch.Size([1, 6618]) Input IDs shape: torch.Size([1, 6618]) Labels shape: torch.Size([1, 6618]) Final batch size: 1, sequence length: 20991 Attention mask shape: torch.Size([1, 1, 20991, 20991]) Position ids shape: torch.Size([1, 20991]) Input IDs shape: torch.Size([1, 20991]) Labels shape: torch.Size([1, 20991]) Final batch size: 1, sequence length: 12969 Attention mask shape: torch.Size([1, 1, 12969, 12969]) Position ids shape: torch.Size([1, 12969]) Input IDs shape: torch.Size([1, 12969]) Labels shape: torch.Size([1, 12969]) Final batch size: 1, sequence length: 14577 Attention mask shape: torch.Size([1, 1, 14577, 14577]) Position ids shape: torch.Size([1, 14577]) Input IDs shape: torch.Size([1, 14577]) Labels shape: torch.Size([1, 14577]) Final batch size: 1, sequence length: 19724 Attention mask shape: torch.Size([1, 1, 19724, 19724]) Position ids shape: torch.Size([1, 19724]) Input IDs shape: torch.Size([1, 19724]) Labels shape: torch.Size([1, 19724]) Final batch size: 1, sequence length: 20465 Attention mask shape: torch.Size([1, 1, 20465, 20465]) Position ids shape: torch.Size([1, 20465]) Input IDs shape: torch.Size([1, 20465]) Labels shape: torch.Size([1, 20465]) Final batch size: 1, sequence length: 13072 Attention mask shape: torch.Size([1, 1, 13072, 13072]) Position ids shape: torch.Size([1, 13072]) Input IDs shape: torch.Size([1, 13072]) Labels shape: torch.Size([1, 13072]) Final batch size: 1, sequence length: 23805 Attention mask shape: torch.Size([1, 1, 23805, 23805]) Position ids shape: torch.Size([1, 23805]) Input IDs shape: torch.Size([1, 23805]) Labels shape: torch.Size([1, 23805]) Final batch size: 1, sequence length: 21598 Attention mask shape: torch.Size([1, 1, 21598, 21598]) Position ids shape: torch.Size([1, 21598]) Input IDs shape: torch.Size([1, 21598]) Labels shape: torch.Size([1, 21598]) Final batch size: 1, sequence length: 22949 Attention mask shape: torch.Size([1, 1, 22949, 22949]) Position ids shape: torch.Size([1, 22949]) Input IDs shape: torch.Size([1, 22949]) Labels shape: torch.Size([1, 22949]) Final batch size: 1, sequence length: 21907 Attention mask shape: torch.Size([1, 1, 21907, 21907]) Position ids shape: torch.Size([1, 21907]) Input IDs shape: torch.Size([1, 21907]) Labels shape: torch.Size([1, 21907]) Final batch size: 1, sequence length: 22851 Attention mask shape: torch.Size([1, 1, 22851, 22851]) Position ids shape: torch.Size([1, 22851]) Input IDs shape: torch.Size([1, 22851]) Labels shape: torch.Size([1, 22851]) Final batch size: 1, sequence length: 22778 Attention mask shape: torch.Size([1, 1, 22778, 22778]) Position ids shape: torch.Size([1, 22778]) Input IDs shape: torch.Size([1, 22778]) Labels shape: torch.Size([1, 22778]) Final batch size: 1, sequence length: 23293 Attention mask shape: torch.Size([1, 1, 23293, 23293]) Position ids shape: torch.Size([1, 23293]) Input IDs shape: torch.Size([1, 23293]) Labels shape: torch.Size([1, 23293]) Final batch size: 1, sequence length: 3010 Attention mask shape: torch.Size([1, 1, 3010, 3010]) Position ids shape: torch.Size([1, 3010]) Input IDs shape: torch.Size([1, 3010]) Labels shape: torch.Size([1, 3010]) Final batch size: 1, sequence length: 26424 Attention mask shape: torch.Size([1, 1, 26424, 26424]) Position ids shape: torch.Size([1, 26424]) Input IDs shape: torch.Size([1, 26424]) Labels shape: torch.Size([1, 26424]) Final batch size: 1, sequence length: 23437 Attention mask shape: torch.Size([1, 1, 23437, 23437]) Position ids shape: torch.Size([1, 23437]) Input IDs shape: torch.Size([1, 23437]) Labels shape: torch.Size([1, 23437]) Final batch size: 1, sequence length: 14548 Attention mask shape: torch.Size([1, 1, 14548, 14548]) Position ids shape: torch.Size([1, 14548]) Input IDs shape: torch.Size([1, 14548]) Labels shape: torch.Size([1, 14548]) Final batch size: 1, sequence length: 28474 Attention mask shape: torch.Size([1, 1, 28474, 28474]) Position ids shape: torch.Size([1, 28474]) Input IDs shape: torch.Size([1, 28474]) Labels shape: torch.Size([1, 28474]) Final batch size: 1, sequence length: 26500 Attention mask shape: torch.Size([1, 1, 26500, 26500]) Position ids shape: torch.Size([1, 26500]) Input IDs shape: torch.Size([1, 26500]) Labels shape: torch.Size([1, 26500]) Final batch size: 1, sequence length: 23755 Attention mask shape: torch.Size([1, 1, 23755, 23755]) Position ids shape: torch.Size([1, 23755]) Input IDs shape: torch.Size([1, 23755]) Labels shape: torch.Size([1, 23755]) Final batch size: 1, sequence length: 28012 Attention mask shape: torch.Size([1, 1, 28012, 28012]) Position ids shape: torch.Size([1, 28012]) Input IDs shape: torch.Size([1, 28012]) Labels shape: torch.Size([1, 28012]) Final batch size: 1, sequence length: 21853 Attention mask shape: torch.Size([1, 1, 21853, 21853]) Position ids shape: torch.Size([1, 21853]) Input IDs shape: torch.Size([1, 21853]) Labels shape: torch.Size([1, 21853]) Final batch size: 1, sequence length: 21322 Attention mask shape: torch.Size([1, 1, 21322, 21322]) Position ids shape: torch.Size([1, 21322]) Input IDs shape: torch.Size([1, 21322]) Labels shape: torch.Size([1, 21322]) Final batch size: 1, sequence length: 23442 Attention mask shape: torch.Size([1, 1, 23442, 23442]) Position ids shape: torch.Size([1, 23442]) Input IDs shape: torch.Size([1, 23442]) Labels shape: torch.Size([1, 23442]) Final batch size: 1, sequence length: 25934 Attention mask shape: torch.Size([1, 1, 25934, 25934]) Position ids shape: torch.Size([1, 25934]) Final batch size: 1, sequence length: 11662Input IDs shape: torch.Size([1, 25934]) Labels shape: torch.Size([1, 25934]) Attention mask shape: torch.Size([1, 1, 11662, 11662]) Position ids shape: torch.Size([1, 11662]) Input IDs shape: torch.Size([1, 11662]) Labels shape: torch.Size([1, 11662]) Final batch size: 1, sequence length: 27384 Attention mask shape: torch.Size([1, 1, 27384, 27384]) Position ids shape: torch.Size([1, 27384]) Input IDs shape: torch.Size([1, 27384]) Labels shape: torch.Size([1, 27384]) Final batch size: 1, sequence length: 24107 Attention mask shape: torch.Size([1, 1, 24107, 24107]) Position ids shape: torch.Size([1, 24107]) Input IDs shape: torch.Size([1, 24107]) Labels shape: torch.Size([1, 24107]) Final batch size: 1, sequence length: 20314 Attention mask shape: torch.Size([1, 1, 20314, 20314]) Position ids shape: torch.Size([1, 20314]) Input IDs shape: torch.Size([1, 20314]) Labels shape: torch.Size([1, 20314]) Final batch size: 1, sequence length: 26454 Attention mask shape: torch.Size([1, 1, 26454, 26454]) Position ids shape: torch.Size([1, 26454]) Input IDs shape: torch.Size([1, 26454]) Labels shape: torch.Size([1, 26454]) Final batch size: 1, sequence length: 30235 Attention mask shape: torch.Size([1, 1, 30235, 30235]) Position ids shape: torch.Size([1, 30235]) Input IDs shape: torch.Size([1, 30235]) Labels shape: torch.Size([1, 30235]) Final batch size: 1, sequence length: 14137 Attention mask shape: torch.Size([1, 1, 14137, 14137]) Position ids shape: torch.Size([1, 14137]) Input IDs shape: torch.Size([1, 14137]) Labels shape: torch.Size([1, 14137]) Final batch size: 1, sequence length: 19027 Attention mask shape: torch.Size([1, 1, 19027, 19027]) Position ids shape: torch.Size([1, 19027]) Input IDs shape: torch.Size([1, 19027]) Labels shape: torch.Size([1, 19027]) Final batch size: 1, sequence length: 24364 Attention mask shape: torch.Size([1, 1, 24364, 24364]) Position ids shape: torch.Size([1, 24364]) Input IDs shape: torch.Size([1, 24364]) Labels shape: torch.Size([1, 24364]) Final batch size: 1, sequence length: 24716 Attention mask shape: torch.Size([1, 1, 24716, 24716]) Position ids shape: torch.Size([1, 24716]) Input IDs shape: torch.Size([1, 24716]) Labels shape: torch.Size([1, 24716]) Final batch size: 1, sequence length: 27520 Attention mask shape: torch.Size([1, 1, 27520, 27520]) Position ids shape: torch.Size([1, 27520]) Input IDs shape: torch.Size([1, 27520]) Labels shape: torch.Size([1, 27520]) Final batch size: 1, sequence length: 20028 Attention mask shape: torch.Size([1, 1, 20028, 20028]) Position ids shape: torch.Size([1, 20028]) Input IDs shape: torch.Size([1, 20028]) Labels shape: torch.Size([1, 20028]) Final batch size: 1, sequence length: 22920 Attention mask shape: torch.Size([1, 1, 22920, 22920]) Position ids shape: torch.Size([1, 22920]) Input IDs shape: torch.Size([1, 22920]) Labels shape: torch.Size([1, 22920]) Final batch size: 1, sequence length: 30041 Attention mask shape: torch.Size([1, 1, 30041, 30041]) Position ids shape: torch.Size([1, 30041]) Input IDs shape: torch.Size([1, 30041]) Labels shape: torch.Size([1, 30041]) Final batch size: 1, sequence length: 12234 Attention mask shape: torch.Size([1, 1, 12234, 12234]) Position ids shape: torch.Size([1, 12234]) Input IDs shape: torch.Size([1, 12234]) Labels shape: torch.Size([1, 12234]) Final batch size: 1, sequence length: 23855 Attention mask shape: torch.Size([1, 1, 23855, 23855]) Position ids shape: torch.Size([1, 23855]) Input IDs shape: torch.Size([1, 23855]) Labels shape: torch.Size([1, 23855]) Final batch size: 1, sequence length: 33765 Attention mask shape: torch.Size([1, 1, 33765, 33765]) Position ids shape: torch.Size([1, 33765]) Input IDs shape: torch.Size([1, 33765]) Labels shape: torch.Size([1, 33765]) Final batch size: 1, sequence length: 12519 Attention mask shape: torch.Size([1, 1, 12519, 12519]) Position ids shape: torch.Size([1, 12519]) Input IDs shape: torch.Size([1, 12519]) Labels shape: torch.Size([1, 12519]) Final batch size: 1, sequence length: 24830 Attention mask shape: torch.Size([1, 1, 24830, 24830]) Position ids shape: torch.Size([1, 24830]) Input IDs shape: torch.Size([1, 24830]) Labels shape: torch.Size([1, 24830]) Final batch size: 1, sequence length: 23695 Attention mask shape: torch.Size([1, 1, 23695, 23695]) Position ids shape: torch.Size([1, 23695]) Input IDs shape: torch.Size([1, 23695]) Labels shape: torch.Size([1, 23695]) Final batch size: 1, sequence length: 24111 Attention mask shape: torch.Size([1, 1, 24111, 24111]) Position ids shape: torch.Size([1, 24111]) Input IDs shape: torch.Size([1, 24111]) Labels shape: torch.Size([1, 24111]) Final batch size: 1, sequence length: 21168 Attention mask shape: torch.Size([1, 1, 21168, 21168]) Position ids shape: torch.Size([1, 21168]) Input IDs shape: torch.Size([1, 21168]) Labels shape: torch.Size([1, 21168]) Final batch size: 1, sequence length: 29385 Attention mask shape: torch.Size([1, 1, 29385, 29385]) Position ids shape: torch.Size([1, 29385]) Input IDs shape: torch.Size([1, 29385]) Labels shape: torch.Size([1, 29385]) Final batch size: 1, sequence length: 27107 Attention mask shape: torch.Size([1, 1, 27107, 27107]) Position ids shape: torch.Size([1, 27107]) Input IDs shape: torch.Size([1, 27107]) Labels shape: torch.Size([1, 27107]) Final batch size: 1, sequence length: 34169 Attention mask shape: torch.Size([1, 1, 34169, 34169]) Position ids shape: torch.Size([1, 34169]) Input IDs shape: torch.Size([1, 34169]) Labels shape: torch.Size([1, 34169]) Final batch size: 1, sequence length: 19302 Attention mask shape: torch.Size([1, 1, 19302, 19302]) Position ids shape: torch.Size([1, 19302]) Input IDs shape: torch.Size([1, 19302]) Labels shape: torch.Size([1, 19302]) Final batch size: 1, sequence length: 29743 Attention mask shape: torch.Size([1, 1, 29743, 29743]) Position ids shape: torch.Size([1, 29743]) Input IDs shape: torch.Size([1, 29743]) Labels shape: torch.Size([1, 29743]) Final batch size: 1, sequence length: 17763 Attention mask shape: torch.Size([1, 1, 17763, 17763]) Position ids shape: torch.Size([1, 17763]) Input IDs shape: torch.Size([1, 17763]) Labels shape: torch.Size([1, 17763]) Final batch size: 1, sequence length: 33179 Attention mask shape: torch.Size([1, 1, 33179, 33179]) Position ids shape: torch.Size([1, 33179]) Input IDs shape: torch.Size([1, 33179]) Labels shape: torch.Size([1, 33179]) Final batch size: 1, sequence length: 38576 Attention mask shape: torch.Size([1, 1, 38576, 38576]) Position ids shape: torch.Size([1, 38576]) Input IDs shape: torch.Size([1, 38576]) Labels shape: torch.Size([1, 38576]) Final batch size: 1, sequence length: 24923 Attention mask shape: torch.Size([1, 1, 24923, 24923]) Position ids shape: torch.Size([1, 24923]) Input IDs shape: torch.Size([1, 24923]) Labels shape: torch.Size([1, 24923]) Final batch size: 1, sequence length: 29842 Attention mask shape: torch.Size([1, 1, 29842, 29842]) Position ids shape: torch.Size([1, 29842]) Input IDs shape: torch.Size([1, 29842]) Labels shape: torch.Size([1, 29842]) Final batch size: 1, sequence length: 33321 Attention mask shape: torch.Size([1, 1, 33321, 33321]) Position ids shape: torch.Size([1, 33321]) Input IDs shape: torch.Size([1, 33321]) Labels shape: torch.Size([1, 33321]) Final batch size: 1, sequence length: 27738 Attention mask shape: torch.Size([1, 1, 27738, 27738]) Position ids shape: torch.Size([1, 27738]) Input IDs shape: torch.Size([1, 27738]) Labels shape: torch.Size([1, 27738]) Final batch size: 1, sequence length: 31404 Attention mask shape: torch.Size([1, 1, 31404, 31404]) Position ids shape: torch.Size([1, 31404]) Input IDs shape: torch.Size([1, 31404]) Labels shape: torch.Size([1, 31404]) Final batch size: 1, sequence length: 10139 Attention mask shape: torch.Size([1, 1, 10139, 10139]) Position ids shape: torch.Size([1, 10139]) Input IDs shape: torch.Size([1, 10139]) Labels shape: torch.Size([1, 10139]) Final batch size: 1, sequence length: 32217 Attention mask shape: torch.Size([1, 1, 32217, 32217]) Position ids shape: torch.Size([1, 32217]) Input IDs shape: torch.Size([1, 32217]) Labels shape: torch.Size([1, 32217]) Final batch size: 1, sequence length: 37657 Attention mask shape: torch.Size([1, 1, 37657, 37657]) Position ids shape: torch.Size([1, 37657]) Input IDs shape: torch.Size([1, 37657]) Labels shape: torch.Size([1, 37657]) Final batch size: 1, sequence length: 40006 Attention mask shape: torch.Size([1, 1, 40006, 40006]) Position ids shape: torch.Size([1, 40006]) Input IDs shape: torch.Size([1, 40006]) Labels shape: torch.Size([1, 40006]) Final batch size: 1, sequence length: 26696 Attention mask shape: torch.Size([1, 1, 26696, 26696]) Position ids shape: torch.Size([1, 26696]) Input IDs shape: torch.Size([1, 26696]) Labels shape: torch.Size([1, 26696]) Final batch size: 1, sequence length: 35555 Attention mask shape: torch.Size([1, 1, 35555, 35555]) Position ids shape: torch.Size([1, 35555]) Input IDs shape: torch.Size([1, 35555]) Labels shape: torch.Size([1, 35555]) Final batch size: 1, sequence length: 24187 Attention mask shape: torch.Size([1, 1, 24187, 24187]) Position ids shape: torch.Size([1, 24187]) Input IDs shape: torch.Size([1, 24187]) Labels shape: torch.Size([1, 24187]) Final batch size: 1, sequence length: 18734 Attention mask shape: torch.Size([1, 1, 18734, 18734]) Position ids shape: torch.Size([1, 18734]) Input IDs shape: torch.Size([1, 18734]) Labels shape: torch.Size([1, 18734]) Final batch size: 1, sequence length: 26685 Attention mask shape: torch.Size([1, 1, 26685, 26685]) Position ids shape: torch.Size([1, 26685]) Input IDs shape: torch.Size([1, 26685]) Labels shape: torch.Size([1, 26685]) Final batch size: 1, sequence length: 39018 Attention mask shape: torch.Size([1, 1, 39018, 39018]) Position ids shape: torch.Size([1, 39018]) Input IDs shape: torch.Size([1, 39018]) Labels shape: torch.Size([1, 39018]) Final batch size: 1, sequence length: 29085 Attention mask shape: torch.Size([1, 1, 29085, 29085]) Position ids shape: torch.Size([1, 29085]) Input IDs shape: torch.Size([1, 29085]) Labels shape: torch.Size([1, 29085]) Final batch size: 1, sequence length: 29286 Attention mask shape: torch.Size([1, 1, 29286, 29286]) Position ids shape: torch.Size([1, 29286]) Input IDs shape: torch.Size([1, 29286]) Labels shape: torch.Size([1, 29286]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 35949 Attention mask shape: torch.Size([1, 1, 35949, 35949]) Position ids shape: torch.Size([1, 35949]) Input IDs shape: torch.Size([1, 35949]) Labels shape: torch.Size([1, 35949]) Final batch size: 1, sequence length: 31991 Attention mask shape: torch.Size([1, 1, 31991, 31991]) Position ids shape: torch.Size([1, 31991]) Input IDs shape: torch.Size([1, 31991]) Labels shape: torch.Size([1, 31991]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 31115 Attention mask shape: torch.Size([1, 1, 31115, 31115]) Position ids shape: torch.Size([1, 31115]) Input IDs shape: torch.Size([1, 31115]) Labels shape: torch.Size([1, 31115]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 19138 Attention mask shape: torch.Size([1, 1, 19138, 19138]) Position ids shape: torch.Size([1, 19138]) Input IDs shape: torch.Size([1, 19138]) Labels shape: torch.Size([1, 19138]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 25990 Attention mask shape: torch.Size([1, 1, 25990, 25990]) Position ids shape: torch.Size([1, 25990]) Input IDs shape: torch.Size([1, 25990]) Labels shape: torch.Size([1, 25990]) Final batch size: 1, sequence length: 23848 Attention mask shape: torch.Size([1, 1, 23848, 23848]) Position ids shape: torch.Size([1, 23848]) Input IDs shape: torch.Size([1, 23848]) Labels shape: torch.Size([1, 23848]) Final batch size: 1, sequence length: 22500 Attention mask shape: torch.Size([1, 1, 22500, 22500]) Position ids shape: torch.Size([1, 22500]) Input IDs shape: torch.Size([1, 22500]) Labels shape: torch.Size([1, 22500]) Final batch size: 1, sequence length: 32762 Attention mask shape: torch.Size([1, 1, 32762, 32762]) Position ids shape: torch.Size([1, 32762]) Input IDs shape: torch.Size([1, 32762]) Labels shape: torch.Size([1, 32762]) Final batch size: 1, sequence length: 26372 Attention mask shape: torch.Size([1, 1, 26372, 26372]) Position ids shape: torch.Size([1, 26372]) Input IDs shape: torch.Size([1, 26372]) Labels shape: torch.Size([1, 26372]) Final batch size: 1, sequence length: 40834 Attention mask shape: torch.Size([1, 1, 40834, 40834]) Position ids shape: torch.Size([1, 40834]) Input IDs shape: torch.Size([1, 40834]) Labels shape: torch.Size([1, 40834]) Final batch size: 1, sequence length: 22778 Attention mask shape: torch.Size([1, 1, 22778, 22778]) Position ids shape: torch.Size([1, 22778]) Input IDs shape: torch.Size([1, 22778]) Labels shape: torch.Size([1, 22778]) Final batch size: 1, sequence length: 37680 Attention mask shape: torch.Size([1, 1, 37680, 37680]) Position ids shape: torch.Size([1, 37680]) Input IDs shape: torch.Size([1, 37680]) Labels shape: torch.Size([1, 37680]) Final batch size: 1, sequence length: 32675 Attention mask shape: torch.Size([1, 1, 32675, 32675]) Position ids shape: torch.Size([1, 32675]) Input IDs shape: torch.Size([1, 32675]) Labels shape: torch.Size([1, 32675]) Final batch size: 1, sequence length: 34628 Attention mask shape: torch.Size([1, 1, 34628, 34628]) Position ids shape: torch.Size([1, 34628]) Input IDs shape: torch.Size([1, 34628]) Labels shape: torch.Size([1, 34628]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 34791 Attention mask shape: torch.Size([1, 1, 34791, 34791]) Position ids shape: torch.Size([1, 34791]) Input IDs shape: torch.Size([1, 34791]) Labels shape: torch.Size([1, 34791]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) {'loss': 0.2687, 'grad_norm': 0.33829235703869354, 'learning_rate': 6.039558454088796e-06, 'num_tokens': -inf, 'epoch': 3.88} Final batch size: 1, sequence length: 5686 Attention mask shape: torch.Size([1, 1, 5686, 5686]) Position ids shape: torch.Size([1, 5686]) Input IDs shape: torch.Size([1, 5686]) Labels shape: torch.Size([1, 5686]) Final batch size: 1, sequence length: 6986 Attention mask shape: torch.Size([1, 1, 6986, 6986]) Position ids shape: torch.Size([1, 6986]) Input IDs shape: torch.Size([1, 6986]) Labels shape: torch.Size([1, 6986]) Final batch size: 1, sequence length: 11304 Attention mask shape: torch.Size([1, 1, 11304, 11304]) Position ids shape: torch.Size([1, 11304]) Input IDs shape: torch.Size([1, 11304]) Labels shape: torch.Size([1, 11304]) Final batch size: 1, sequence length: 14168 Attention mask shape: torch.Size([1, 1, 14168, 14168]) Position ids shape: torch.Size([1, 14168]) Input IDs shape: torch.Size([1, 14168]) Labels shape: torch.Size([1, 14168]) Final batch size: 1, sequence length: 13872 Attention mask shape: torch.Size([1, 1, 13872, 13872]) Position ids shape: torch.Size([1, 13872]) Input IDs shape: torch.Size([1, 13872]) Labels shape: torch.Size([1, 13872]) Final batch size: 1, sequence length: 14304 Attention mask shape: torch.Size([1, 1, 14304, 14304]) Position ids shape: torch.Size([1, 14304]) Input IDs shape: torch.Size([1, 14304]) Labels shape: torch.Size([1, 14304]) Final batch size: 1, sequence length: 12847 Attention mask shape: torch.Size([1, 1, 12847, 12847]) Position ids shape: torch.Size([1, 12847]) Input IDs shape: torch.Size([1, 12847]) Labels shape: torch.Size([1, 12847]) Final batch size: 1, sequence length: 13224 Attention mask shape: torch.Size([1, 1, 13224, 13224]) Position ids shape: torch.Size([1, 13224]) Input IDs shape: torch.Size([1, 13224]) Labels shape: torch.Size([1, 13224]) Final batch size: 1, sequence length: 16440 Attention mask shape: torch.Size([1, 1, 16440, 16440]) Position ids shape: torch.Size([1, 16440]) Input IDs shape: torch.Size([1, 16440]) Labels shape: torch.Size([1, 16440]) Final batch size: 1, sequence length: 16514 Attention mask shape: torch.Size([1, 1, 16514, 16514]) Position ids shape: torch.Size([1, 16514]) Input IDs shape: torch.Size([1, 16514]) Labels shape: torch.Size([1, 16514]) Final batch size: 1, sequence length: 18246 Attention mask shape: torch.Size([1, 1, 18246, 18246]) Position ids shape: torch.Size([1, 18246]) Input IDs shape: torch.Size([1, 18246]) Labels shape: torch.Size([1, 18246]) Final batch size: 1, sequence length: 17852 Attention mask shape: torch.Size([1, 1, 17852, 17852]) Position ids shape: torch.Size([1, 17852]) Input IDs shape: torch.Size([1, 17852]) Labels shape: torch.Size([1, 17852]) Final batch size: 1, sequence length: 15652 Attention mask shape: torch.Size([1, 1, 15652, 15652]) Position ids shape: torch.Size([1, 15652]) Input IDs shape: torch.Size([1, 15652]) Labels shape: torch.Size([1, 15652]) Final batch size: 1, sequence length: 18044 Attention mask shape: torch.Size([1, 1, 18044, 18044]) Position ids shape: torch.Size([1, 18044]) Input IDs shape: torch.Size([1, 18044]) Labels shape: torch.Size([1, 18044]) Final batch size: 1, sequence length: 15621 Attention mask shape: torch.Size([1, 1, 15621, 15621]) Position ids shape: torch.Size([1, 15621]) Input IDs shape: torch.Size([1, 15621]) Labels shape: torch.Size([1, 15621]) Final batch size: 1, sequence length: 16501 Attention mask shape: torch.Size([1, 1, 16501, 16501]) Position ids shape: torch.Size([1, 16501]) Input IDs shape: torch.Size([1, 16501]) Labels shape: torch.Size([1, 16501]) Final batch size: 1, sequence length: 17773 Attention mask shape: torch.Size([1, 1, 17773, 17773]) Position ids shape: torch.Size([1, 17773]) Input IDs shape: torch.Size([1, 17773]) Labels shape: torch.Size([1, 17773]) Final batch size: 1, sequence length: 16378 Attention mask shape: torch.Size([1, 1, 16378, 16378]) Position ids shape: torch.Size([1, 16378]) Input IDs shape: torch.Size([1, 16378]) Labels shape: torch.Size([1, 16378]) Final batch size: 1, sequence length: 17213 Attention mask shape: torch.Size([1, 1, 17213, 17213]) Position ids shape: torch.Size([1, 17213]) Input IDs shape: torch.Size([1, 17213]) Labels shape: torch.Size([1, 17213]) Final batch size: 1, sequence length: 21163 Attention mask shape: torch.Size([1, 1, 21163, 21163]) Position ids shape: torch.Size([1, 21163]) Input IDs shape: torch.Size([1, 21163]) Labels shape: torch.Size([1, 21163]) Final batch size: 1, sequence length: 20176 Attention mask shape: torch.Size([1, 1, 20176, 20176]) Position ids shape: torch.Size([1, 20176]) Input IDs shape: torch.Size([1, 20176]) Labels shape: torch.Size([1, 20176]) Final batch size: 1, sequence length: 20766 Attention mask shape: torch.Size([1, 1, 20766, 20766]) Position ids shape: torch.Size([1, 20766]) Input IDs shape: torch.Size([1, 20766]) Labels shape: torch.Size([1, 20766]) Final batch size: 1, sequence length: 18155 Attention mask shape: torch.Size([1, 1, 18155, 18155]) Position ids shape: torch.Size([1, 18155]) Input IDs shape: torch.Size([1, 18155]) Labels shape: torch.Size([1, 18155]) Final batch size: 1, sequence length: 21316 Attention mask shape: torch.Size([1, 1, 21316, 21316]) Position ids shape: torch.Size([1, 21316]) Input IDs shape: torch.Size([1, 21316]) Labels shape: torch.Size([1, 21316]) Final batch size: 1, sequence length: 18289 Attention mask shape: torch.Size([1, 1, 18289, 18289]) Position ids shape: torch.Size([1, 18289]) Input IDs shape: torch.Size([1, 18289]) Labels shape: torch.Size([1, 18289]) Final batch size: 1, sequence length: 20813 Attention mask shape: torch.Size([1, 1, 20813, 20813]) Position ids shape: torch.Size([1, 20813]) Input IDs shape: torch.Size([1, 20813]) Labels shape: torch.Size([1, 20813]) Final batch size: 1, sequence length: 21607 Attention mask shape: torch.Size([1, 1, 21607, 21607]) Position ids shape: torch.Size([1, 21607]) Input IDs shape: torch.Size([1, 21607]) Labels shape: torch.Size([1, 21607]) Final batch size: 1, sequence length: 22152 Attention mask shape: torch.Size([1, 1, 22152, 22152]) Position ids shape: torch.Size([1, 22152]) Input IDs shape: torch.Size([1, 22152]) Labels shape: torch.Size([1, 22152]) Final batch size: 1, sequence length: 22117 Attention mask shape: torch.Size([1, 1, 22117, 22117]) Position ids shape: torch.Size([1, 22117]) Input IDs shape: torch.Size([1, 22117]) Labels shape: torch.Size([1, 22117]) Final batch size: 1, sequence length: 21705 Attention mask shape: torch.Size([1, 1, 21705, 21705]) Position ids shape: torch.Size([1, 21705]) Input IDs shape: torch.Size([1, 21705]) Labels shape: torch.Size([1, 21705]) Final batch size: 1, sequence length: 21207 Attention mask shape: torch.Size([1, 1, 21207, 21207]) Position ids shape: torch.Size([1, 21207]) Input IDs shape: torch.Size([1, 21207]) Labels shape: torch.Size([1, 21207]) Final batch size: 1, sequence length: 22784 Attention mask shape: torch.Size([1, 1, 22784, 22784]) Position ids shape: torch.Size([1, 22784]) Input IDs shape: torch.Size([1, 22784]) Labels shape: torch.Size([1, 22784]) Final batch size: 1, sequence length: 25880 Attention mask shape: torch.Size([1, 1, 25880, 25880]) Position ids shape: torch.Size([1, 25880]) Input IDs shape: torch.Size([1, 25880]) Labels shape: torch.Size([1, 25880]) Final batch size: 1, sequence length: 21562 Attention mask shape: torch.Size([1, 1, 21562, 21562]) Position ids shape: torch.Size([1, 21562]) Input IDs shape: torch.Size([1, 21562]) Labels shape: torch.Size([1, 21562]) Final batch size: 1, sequence length: 22197 Attention mask shape: torch.Size([1, 1, 22197, 22197]) Position ids shape: torch.Size([1, 22197]) Input IDs shape: torch.Size([1, 22197]) Labels shape: torch.Size([1, 22197]) Final batch size: 1, sequence length: 26494 Attention mask shape: torch.Size([1, 1, 26494, 26494]) Position ids shape: torch.Size([1, 26494]) Input IDs shape: torch.Size([1, 26494]) Labels shape: torch.Size([1, 26494]) Final batch size: 1, sequence length: 25053 Attention mask shape: torch.Size([1, 1, 25053, 25053]) Position ids shape: torch.Size([1, 25053]) Input IDs shape: torch.Size([1, 25053]) Labels shape: torch.Size([1, 25053]) Final batch size: 1, sequence length: 26303 Attention mask shape: torch.Size([1, 1, 26303, 26303]) Position ids shape: torch.Size([1, 26303]) Input IDs shape: torch.Size([1, 26303]) Labels shape: torch.Size([1, 26303]) Final batch size: 1, sequence length: 27005 Attention mask shape: torch.Size([1, 1, 27005, 27005]) Position ids shape: torch.Size([1, 27005]) Input IDs shape: torch.Size([1, 27005]) Labels shape: torch.Size([1, 27005]) Final batch size: 1, sequence length: 26921 Attention mask shape: torch.Size([1, 1, 26921, 26921]) Position ids shape: torch.Size([1, 26921]) Input IDs shape: torch.Size([1, 26921]) Labels shape: torch.Size([1, 26921]) Final batch size: 1, sequence length: 28523 Attention mask shape: torch.Size([1, 1, 28523, 28523]) Position ids shape: torch.Size([1, 28523]) Input IDs shape: torch.Size([1, 28523]) Labels shape: torch.Size([1, 28523]) Final batch size: 1, sequence length: 30151 Attention mask shape: torch.Size([1, 1, 30151, 30151]) Position ids shape: torch.Size([1, 30151]) Input IDs shape: torch.Size([1, 30151]) Labels shape: torch.Size([1, 30151]) Final batch size: 1, sequence length: 30592 Attention mask shape: torch.Size([1, 1, 30592, 30592]) Position ids shape: torch.Size([1, 30592]) Input IDs shape: torch.Size([1, 30592]) Labels shape: torch.Size([1, 30592]) Final batch size: 1, sequence length: 32473 Attention mask shape: torch.Size([1, 1, 32473, 32473]) Position ids shape: torch.Size([1, 32473]) Input IDs shape: torch.Size([1, 32473]) Labels shape: torch.Size([1, 32473]) Final batch size: 1, sequence length: 32596 Attention mask shape: torch.Size([1, 1, 32596, 32596]) Position ids shape: torch.Size([1, 32596]) Input IDs shape: torch.Size([1, 32596]) Labels shape: torch.Size([1, 32596]) Final batch size: 1, sequence length: 33482 Attention mask shape: torch.Size([1, 1, 33482, 33482]) Position ids shape: torch.Size([1, 33482]) Input IDs shape: torch.Size([1, 33482]) Labels shape: torch.Size([1, 33482]) Final batch size: 1, sequence length: 35030 Attention mask shape: torch.Size([1, 1, 35030, 35030]) Position ids shape: torch.Size([1, 35030]) Input IDs shape: torch.Size([1, 35030]) Labels shape: torch.Size([1, 35030]) Final batch size: 1, sequence length: 28897 Attention mask shape: torch.Size([1, 1, 28897, 28897]) Position ids shape: torch.Size([1, 28897]) Input IDs shape: torch.Size([1, 28897]) Labels shape: torch.Size([1, 28897]) Final batch size: 1, sequence length: 35100 Attention mask shape: torch.Size([1, 1, 35100, 35100]) Position ids shape: torch.Size([1, 35100]) Input IDs shape: torch.Size([1, 35100]) Labels shape: torch.Size([1, 35100]) Final batch size: 1, sequence length: 35485 Attention mask shape: torch.Size([1, 1, 35485, 35485]) Position ids shape: torch.Size([1, 35485]) Input IDs shape: torch.Size([1, 35485]) Labels shape: torch.Size([1, 35485]) Final batch size: 1, sequence length: 35866 Attention mask shape: torch.Size([1, 1, 35866, 35866]) Position ids shape: torch.Size([1, 35866]) Input IDs shape: torch.Size([1, 35866]) Labels shape: torch.Size([1, 35866]) Final batch size: 1, sequence length: 35523 Attention mask shape: torch.Size([1, 1, 35523, 35523]) Position ids shape: torch.Size([1, 35523]) Input IDs shape: torch.Size([1, 35523]) Labels shape: torch.Size([1, 35523]) Final batch size: 1, sequence length: 40526 Attention mask shape: torch.Size([1, 1, 40526, 40526]) Position ids shape: torch.Size([1, 40526]) Input IDs shape: torch.Size([1, 40526]) Labels shape: torch.Size([1, 40526]) Final batch size: 1, sequence length: 34469 Attention mask shape: torch.Size([1, 1, 34469, 34469]) Position ids shape: torch.Size([1, 34469]) Input IDs shape: torch.Size([1, 34469]) Labels shape: torch.Size([1, 34469]) Final batch size: 1, sequence length: 37040 Attention mask shape: torch.Size([1, 1, 37040, 37040]) Position ids shape: torch.Size([1, 37040]) Input IDs shape: torch.Size([1, 37040]) Labels shape: torch.Size([1, 37040]) Final batch size: 1, sequence length: 37980 Attention mask shape: torch.Size([1, 1, 37980, 37980]) Position ids shape: torch.Size([1, 37980]) Input IDs shape: torch.Size([1, 37980]) Labels shape: torch.Size([1, 37980]) Final batch size: 1, sequence length: 39589 Attention mask shape: torch.Size([1, 1, 39589, 39589]) Position ids shape: torch.Size([1, 39589]) Input IDs shape: torch.Size([1, 39589]) Labels shape: torch.Size([1, 39589]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 38927 Attention mask shape: torch.Size([1, 1, 38927, 38927]) Position ids shape: torch.Size([1, 38927]) Input IDs shape: torch.Size([1, 38927]) Labels shape: torch.Size([1, 38927]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) {'loss': 0.2623, 'grad_norm': 0.32081343551863933, 'learning_rate': 5.782172325201155e-06, 'num_tokens': -inf, 'epoch': 4.0} Final batch size: 1, sequence length: 5686 Attention mask shape: torch.Size([1, 1, 5686, 5686]) Position ids shape: torch.Size([1, 5686]) Input IDs shape: torch.Size([1, 5686]) Labels shape: torch.Size([1, 5686]) Final batch size: 1, sequence length: 6986 Attention mask shape: torch.Size([1, 1, 6986, 6986]) Position ids shape: torch.Size([1, 6986]) Input IDs shape: torch.Size([1, 6986]) Labels shape: torch.Size([1, 6986]) Final batch size: 1, sequence length: 6362 Attention mask shape: torch.Size([1, 1, 6362, 6362]) Position ids shape: torch.Size([1, 6362]) Input IDs shape: torch.Size([1, 6362]) Labels shape: torch.Size([1, 6362]) Final batch size: 1, sequence length: 10523 Attention mask shape: torch.Size([1, 1, 10523, 10523]) Position ids shape: torch.Size([1, 10523]) Input IDs shape: torch.Size([1, 10523]) Labels shape: torch.Size([1, 10523]) Final batch size: 1, sequence length: 11304 Attention mask shape: torch.Size([1, 1, 11304, 11304]) Position ids shape: torch.Size([1, 11304]) Input IDs shape: torch.Size([1, 11304]) Labels shape: torch.Size([1, 11304]) Final batch size: 1, sequence length: 14304 Attention mask shape: torch.Size([1, 1, 14304, 14304]) Position ids shape: torch.Size([1, 14304]) Input IDs shape: torch.Size([1, 14304]) Labels shape: torch.Size([1, 14304]) Final batch size: 1, sequence length: 13872 Attention mask shape: torch.Size([1, 1, 13872, 13872]) Position ids shape: torch.Size([1, 13872]) Input IDs shape: torch.Size([1, 13872]) Labels shape: torch.Size([1, 13872]) Final batch size: 1, sequence length: 15652 Attention mask shape: torch.Size([1, 1, 15652, 15652]) Position ids shape: torch.Size([1, 15652]) Input IDs shape: torch.Size([1, 15652]) Labels shape: torch.Size([1, 15652]) Final batch size: 1, sequence length: 15621 Attention mask shape: torch.Size([1, 1, 15621, 15621]) Position ids shape: torch.Size([1, 15621]) Input IDs shape: torch.Size([1, 15621]) Labels shape: torch.Size([1, 15621]) Final batch size: 1, sequence length: 16378 Attention mask shape: torch.Size([1, 1, 16378, 16378]) Position ids shape: torch.Size([1, 16378]) Input IDs shape: torch.Size([1, 16378]) Labels shape: torch.Size([1, 16378]) Final batch size: 1, sequence length: 14168 Attention mask shape: torch.Size([1, 1, 14168, 14168]) Position ids shape: torch.Size([1, 14168]) Input IDs shape: torch.Size([1, 14168]) Labels shape: torch.Size([1, 14168]) Final batch size: 1, sequence length: 12847 Attention mask shape: torch.Size([1, 1, 12847, 12847]) Position ids shape: torch.Size([1, 12847]) Input IDs shape: torch.Size([1, 12847]) Labels shape: torch.Size([1, 12847]) Final batch size: 1, sequence length: 17213 Attention mask shape: torch.Size([1, 1, 17213, 17213]) Position ids shape: torch.Size([1, 17213]) Input IDs shape: torch.Size([1, 17213]) Labels shape: torch.Size([1, 17213]) Final batch size: 1, sequence length: 13224 Attention mask shape: torch.Size([1, 1, 13224, 13224]) Position ids shape: torch.Size([1, 13224]) Input IDs shape: torch.Size([1, 13224]) Labels shape: torch.Size([1, 13224]) Final batch size: 1, sequence length: 18044 Attention mask shape: torch.Size([1, 1, 18044, 18044]) Position ids shape: torch.Size([1, 18044]) Input IDs shape: torch.Size([1, 18044]) Labels shape: torch.Size([1, 18044]) Final batch size: 1, sequence length: 17852 Attention mask shape: torch.Size([1, 1, 17852, 17852]) Position ids shape: torch.Size([1, 17852]) Input IDs shape: torch.Size([1, 17852]) Labels shape: torch.Size([1, 17852]) Final batch size: 1, sequence length: 16440 Attention mask shape: torch.Size([1, 1, 16440, 16440]) Position ids shape: torch.Size([1, 16440]) Input IDs shape: torch.Size([1, 16440]) Labels shape: torch.Size([1, 16440]) Final batch size: 1, sequence length: 16501 Attention mask shape: torch.Size([1, 1, 16501, 16501]) Position ids shape: torch.Size([1, 16501]) Input IDs shape: torch.Size([1, 16501]) Labels shape: torch.Size([1, 16501]) Final batch size: 1, sequence length: 18246 Attention mask shape: torch.Size([1, 1, 18246, 18246]) Position ids shape: torch.Size([1, 18246]) Input IDs shape: torch.Size([1, 18246]) Labels shape: torch.Size([1, 18246]) Final batch size: 1, sequence length: 18155 Attention mask shape: torch.Size([1, 1, 18155, 18155]) Position ids shape: torch.Size([1, 18155]) Input IDs shape: torch.Size([1, 18155]) Labels shape: torch.Size([1, 18155]) Final batch size: 1, sequence length: 20176 Attention mask shape: torch.Size([1, 1, 20176, 20176]) Position ids shape: torch.Size([1, 20176]) Input IDs shape: torch.Size([1, 20176]) Labels shape: torch.Size([1, 20176]) Final batch size: 1, sequence length: 5911 Attention mask shape: torch.Size([1, 1, 5911, 5911]) Position ids shape: torch.Size([1, 5911]) Input IDs shape: torch.Size([1, 5911]) Labels shape: torch.Size([1, 5911]) Final batch size: 1, sequence length: 16514 Attention mask shape: torch.Size([1, 1, 16514, 16514]) Position ids shape: torch.Size([1, 16514]) Input IDs shape: torch.Size([1, 16514]) Labels shape: torch.Size([1, 16514]) Final batch size: 1, sequence length: 18289 Attention mask shape: torch.Size([1, 1, 18289, 18289]) Position ids shape: torch.Size([1, 18289]) Input IDs shape: torch.Size([1, 18289]) Labels shape: torch.Size([1, 18289]) Final batch size: 1, sequence length: 21562 Attention mask shape: torch.Size([1, 1, 21562, 21562]) Position ids shape: torch.Size([1, 21562]) Input IDs shape: torch.Size([1, 21562]) Labels shape: torch.Size([1, 21562]) Final batch size: 1, sequence length: 21163 Attention mask shape: torch.Size([1, 1, 21163, 21163]) Position ids shape: torch.Size([1, 21163]) Input IDs shape: torch.Size([1, 21163]) Labels shape: torch.Size([1, 21163]) Final batch size: 1, sequence length: 21705 Attention mask shape: torch.Size([1, 1, 21705, 21705]) Position ids shape: torch.Size([1, 21705]) Input IDs shape: torch.Size([1, 21705]) Labels shape: torch.Size([1, 21705]) Final batch size: 1, sequence length: 21316 Attention mask shape: torch.Size([1, 1, 21316, 21316]) Position ids shape: torch.Size([1, 21316]) Input IDs shape: torch.Size([1, 21316]) Labels shape: torch.Size([1, 21316]) Final batch size: 1, sequence length: 20813 Attention mask shape: torch.Size([1, 1, 20813, 20813]) Position ids shape: torch.Size([1, 20813]) Input IDs shape: torch.Size([1, 20813]) Labels shape: torch.Size([1, 20813]) Final batch size: 1, sequence length: 17773 Attention mask shape: torch.Size([1, 1, 17773, 17773]) Position ids shape: torch.Size([1, 17773]) Input IDs shape: torch.Size([1, 17773]) Labels shape: torch.Size([1, 17773]) Final batch size: 1, sequence length: 21207 Attention mask shape: torch.Size([1, 1, 21207, 21207]) Position ids shape: torch.Size([1, 21207]) Input IDs shape: torch.Size([1, 21207]) Labels shape: torch.Size([1, 21207]) Final batch size: 1, sequence length: 20766 Attention mask shape: torch.Size([1, 1, 20766, 20766]) Position ids shape: torch.Size([1, 20766]) Input IDs shape: torch.Size([1, 20766]) Labels shape: torch.Size([1, 20766]) Final batch size: 1, sequence length: 18545 Attention mask shape: torch.Size([1, 1, 18545, 18545]) Position ids shape: torch.Size([1, 18545]) Input IDs shape: torch.Size([1, 18545]) Labels shape: torch.Size([1, 18545]) Final batch size: 1, sequence length: 22117 Attention mask shape: torch.Size([1, 1, 22117, 22117]) Position ids shape: torch.Size([1, 22117]) Input IDs shape: torch.Size([1, 22117]) Labels shape: torch.Size([1, 22117]) Final batch size: 1, sequence length: 17910 Attention mask shape: torch.Size([1, 1, 17910, 17910]) Position ids shape: torch.Size([1, 17910]) Input IDs shape: torch.Size([1, 17910]) Labels shape: torch.Size([1, 17910]) Final batch size: 1, sequence length: 21607 Attention mask shape: torch.Size([1, 1, 21607, 21607]) Position ids shape: torch.Size([1, 21607]) Input IDs shape: torch.Size([1, 21607]) Labels shape: torch.Size([1, 21607]) Final batch size: 1, sequence length: 22152 Attention mask shape: torch.Size([1, 1, 22152, 22152]) Position ids shape: torch.Size([1, 22152]) Input IDs shape: torch.Size([1, 22152]) Labels shape: torch.Size([1, 22152]) Final batch size: 1, sequence length: 22197 Attention mask shape: torch.Size([1, 1, 22197, 22197]) Position ids shape: torch.Size([1, 22197]) Input IDs shape: torch.Size([1, 22197]) Labels shape: torch.Size([1, 22197]) Final batch size: 1, sequence length: 22784 Attention mask shape: torch.Size([1, 1, 22784, 22784]) Position ids shape: torch.Size([1, 22784]) Input IDs shape: torch.Size([1, 22784]) Labels shape: torch.Size([1, 22784]) Final batch size: 1, sequence length: 12421 Attention mask shape: torch.Size([1, 1, 12421, 12421]) Position ids shape: torch.Size([1, 12421]) Input IDs shape: torch.Size([1, 12421]) Labels shape: torch.Size([1, 12421]) Final batch size: 1, sequence length: 20101 Attention mask shape: torch.Size([1, 1, 20101, 20101]) Position ids shape: torch.Size([1, 20101]) Input IDs shape: torch.Size([1, 20101]) Labels shape: torch.Size([1, 20101]) Final batch size: 1, sequence length: 25053 Attention mask shape: torch.Size([1, 1, 25053, 25053]) Position ids shape: torch.Size([1, 25053]) Input IDs shape: torch.Size([1, 25053]) Labels shape: torch.Size([1, 25053]) Final batch size: 1, sequence length: 14496 Attention mask shape: torch.Size([1, 1, 14496, 14496]) Position ids shape: torch.Size([1, 14496]) Input IDs shape: torch.Size([1, 14496]) Labels shape: torch.Size([1, 14496]) Final batch size: 1, sequence length: 21962 Attention mask shape: torch.Size([1, 1, 21962, 21962]) Position ids shape: torch.Size([1, 21962]) Input IDs shape: torch.Size([1, 21962]) Labels shape: torch.Size([1, 21962]) Final batch size: 1, sequence length: 23338 Attention mask shape: torch.Size([1, 1, 23338, 23338]) Position ids shape: torch.Size([1, 23338]) Input IDs shape: torch.Size([1, 23338]) Labels shape: torch.Size([1, 23338]) Final batch size: 1, sequence length: 21672 Attention mask shape: torch.Size([1, 1, 21672, 21672]) Position ids shape: torch.Size([1, 21672]) Input IDs shape: torch.Size([1, 21672]) Labels shape: torch.Size([1, 21672]) Final batch size: 1, sequence length: 24365 Final batch size: 1, sequence length: 12224 Attention mask shape: torch.Size([1, 1, 24365, 24365]) Position ids shape: torch.Size([1, 24365]) Input IDs shape: torch.Size([1, 24365]) Labels shape: torch.Size([1, 24365]) Attention mask shape: torch.Size([1, 1, 12224, 12224]) Position ids shape: torch.Size([1, 12224]) Input IDs shape: torch.Size([1, 12224]) Labels shape: torch.Size([1, 12224]) Final batch size: 1, sequence length: 26494 Attention mask shape: torch.Size([1, 1, 26494, 26494]) Position ids shape: torch.Size([1, 26494]) Input IDs shape: torch.Size([1, 26494]) Labels shape: torch.Size([1, 26494]) Final batch size: 1, sequence length: 27633 Attention mask shape: torch.Size([1, 1, 27633, 27633]) Position ids shape: torch.Size([1, 27633]) Input IDs shape: torch.Size([1, 27633]) Labels shape: torch.Size([1, 27633]) Final batch size: 1, sequence length: 13031 Attention mask shape: torch.Size([1, 1, 13031, 13031]) Position ids shape: torch.Size([1, 13031]) Input IDs shape: torch.Size([1, 13031]) Labels shape: torch.Size([1, 13031]) Final batch size: 1, sequence length: 18470 Attention mask shape: torch.Size([1, 1, 18470, 18470]) Position ids shape: torch.Size([1, 18470]) Input IDs shape: torch.Size([1, 18470]) Labels shape: torch.Size([1, 18470]) Final batch size: 1, sequence length: 16057 Attention mask shape: torch.Size([1, 1, 16057, 16057]) Position ids shape: torch.Size([1, 16057]) Input IDs shape: torch.Size([1, 16057]) Labels shape: torch.Size([1, 16057]) Final batch size: 1, sequence length: 21766 Attention mask shape: torch.Size([1, 1, 21766, 21766]) Position ids shape: torch.Size([1, 21766]) Input IDs shape: torch.Size([1, 21766]) Labels shape: torch.Size([1, 21766]) Final batch size: 1, sequence length: 26303 Attention mask shape: torch.Size([1, 1, 26303, 26303]) Position ids shape: torch.Size([1, 26303]) Input IDs shape: torch.Size([1, 26303]) Labels shape: torch.Size([1, 26303]) Final batch size: 1, sequence length: 26921 Attention mask shape: torch.Size([1, 1, 26921, 26921]) Position ids shape: torch.Size([1, 26921]) Input IDs shape: torch.Size([1, 26921]) Labels shape: torch.Size([1, 26921]) Final batch size: 1, sequence length: 30592 Attention mask shape: torch.Size([1, 1, 30592, 30592]) Position ids shape: torch.Size([1, 30592]) Input IDs shape: torch.Size([1, 30592]) Labels shape: torch.Size([1, 30592]) Final batch size: 1, sequence length: 19962 Attention mask shape: torch.Size([1, 1, 19962, 19962]) Position ids shape: torch.Size([1, 19962]) Input IDs shape: torch.Size([1, 19962]) Labels shape: torch.Size([1, 19962]) Final batch size: 1, sequence length: 28523 Attention mask shape: torch.Size([1, 1, 28523, 28523]) Position ids shape: torch.Size([1, 28523]) Input IDs shape: torch.Size([1, 28523]) Labels shape: torch.Size([1, 28523]) Final batch size: 1, sequence length: 23975 Attention mask shape: torch.Size([1, 1, 23975, 23975]) Position ids shape: torch.Size([1, 23975]) Input IDs shape: torch.Size([1, 23975]) Labels shape: torch.Size([1, 23975]) Final batch size: 1, sequence length: 11515 Attention mask shape: torch.Size([1, 1, 11515, 11515]) Position ids shape: torch.Size([1, 11515]) Input IDs shape: torch.Size([1, 11515]) Labels shape: torch.Size([1, 11515]) Final batch size: 1, sequence length: 26138 Attention mask shape: torch.Size([1, 1, 26138, 26138]) Position ids shape: torch.Size([1, 26138]) Input IDs shape: torch.Size([1, 26138]) Labels shape: torch.Size([1, 26138]) Final batch size: 1, sequence length: 32473 Attention mask shape: torch.Size([1, 1, 32473, 32473]) Position ids shape: torch.Size([1, 32473]) Input IDs shape: torch.Size([1, 32473]) Labels shape: torch.Size([1, 32473]) Final batch size: 1, sequence length: 29404 Attention mask shape: torch.Size([1, 1, 29404, 29404]) Position ids shape: torch.Size([1, 29404]) Input IDs shape: torch.Size([1, 29404]) Labels shape: torch.Size([1, 29404]) Final batch size: 1, sequence length: 21581 Attention mask shape: torch.Size([1, 1, 21581, 21581]) Position ids shape: torch.Size([1, 21581]) Input IDs shape: torch.Size([1, 21581]) Labels shape: torch.Size([1, 21581]) Final batch size: 1, sequence length: 28897 Attention mask shape: torch.Size([1, 1, 28897, 28897]) Position ids shape: torch.Size([1, 28897]) Input IDs shape: torch.Size([1, 28897]) Labels shape: torch.Size([1, 28897]) Final batch size: 1, sequence length: 25880 Attention mask shape: torch.Size([1, 1, 25880, 25880]) Position ids shape: torch.Size([1, 25880]) Input IDs shape: torch.Size([1, 25880]) Labels shape: torch.Size([1, 25880]) Final batch size: 1, sequence length: 33482 Attention mask shape: torch.Size([1, 1, 33482, 33482]) Position ids shape: torch.Size([1, 33482]) Input IDs shape: torch.Size([1, 33482]) Labels shape: torch.Size([1, 33482]) Final batch size: 1, sequence length: 23558 Attention mask shape: torch.Size([1, 1, 23558, 23558]) Position ids shape: torch.Size([1, 23558]) Input IDs shape: torch.Size([1, 23558]) Labels shape: torch.Size([1, 23558]) Final batch size: 1, sequence length: 9704 Attention mask shape: torch.Size([1, 1, 9704, 9704]) Position ids shape: torch.Size([1, 9704]) Input IDs shape: torch.Size([1, 9704]) Labels shape: torch.Size([1, 9704]) Final batch size: 1, sequence length: 20198 Attention mask shape: torch.Size([1, 1, 20198, 20198]) Position ids shape: torch.Size([1, 20198]) Input IDs shape: torch.Size([1, 20198]) Labels shape: torch.Size([1, 20198]) Final batch size: 1, sequence length: 20184 Attention mask shape: torch.Size([1, 1, 20184, 20184]) Position ids shape: torch.Size([1, 20184]) Input IDs shape: torch.Size([1, 20184]) Labels shape: torch.Size([1, 20184]) Final batch size: 1, sequence length: 35485 Attention mask shape: torch.Size([1, 1, 35485, 35485]) Position ids shape: torch.Size([1, 35485]) Input IDs shape: torch.Size([1, 35485]) Labels shape: torch.Size([1, 35485]) Final batch size: 1, sequence length: 30965 Attention mask shape: torch.Size([1, 1, 30965, 30965]) Position ids shape: torch.Size([1, 30965]) Input IDs shape: torch.Size([1, 30965]) Labels shape: torch.Size([1, 30965]) Final batch size: 1, sequence length: 34469 Attention mask shape: torch.Size([1, 1, 34469, 34469]) Position ids shape: torch.Size([1, 34469]) Input IDs shape: torch.Size([1, 34469]) Labels shape: torch.Size([1, 34469]) Final batch size: 1, sequence length: 18126 Attention mask shape: torch.Size([1, 1, 18126, 18126]) Position ids shape: torch.Size([1, 18126]) Input IDs shape: torch.Size([1, 18126]) Labels shape: torch.Size([1, 18126]) Final batch size: 1, sequence length: 30428 Attention mask shape: torch.Size([1, 1, 30428, 30428]) Position ids shape: torch.Size([1, 30428]) Input IDs shape: torch.Size([1, 30428]) Labels shape: torch.Size([1, 30428]) Final batch size: 1, sequence length: 29481 Attention mask shape: torch.Size([1, 1, 29481, 29481]) Position ids shape: torch.Size([1, 29481]) Input IDs shape: torch.Size([1, 29481]) Labels shape: torch.Size([1, 29481]) Final batch size: 1, sequence length: 35100 Attention mask shape: torch.Size([1, 1, 35100, 35100]) Position ids shape: torch.Size([1, 35100]) Input IDs shape: torch.Size([1, 35100]) Labels shape: torch.Size([1, 35100]) Final batch size: 1, sequence length: 17456 Attention mask shape: torch.Size([1, 1, 17456, 17456]) Position ids shape: torch.Size([1, 17456]) Input IDs shape: torch.Size([1, 17456]) Labels shape: torch.Size([1, 17456]) Final batch size: 1, sequence length: 32596 Attention mask shape: torch.Size([1, 1, 32596, 32596]) Position ids shape: torch.Size([1, 32596]) Input IDs shape: torch.Size([1, 32596]) Labels shape: torch.Size([1, 32596]) Final batch size: 1, sequence length: 28773 Attention mask shape: torch.Size([1, 1, 28773, 28773]) Position ids shape: torch.Size([1, 28773]) Input IDs shape: torch.Size([1, 28773]) Labels shape: torch.Size([1, 28773]) Final batch size: 1, sequence length: 18023 Final batch size: 1, sequence length: 19620 Attention mask shape: torch.Size([1, 1, 18023, 18023]) Position ids shape: torch.Size([1, 18023]) Input IDs shape: torch.Size([1, 18023]) Labels shape: torch.Size([1, 18023]) Attention mask shape: torch.Size([1, 1, 19620, 19620]) Position ids shape: torch.Size([1, 19620]) Input IDs shape: torch.Size([1, 19620]) Labels shape: torch.Size([1, 19620]) Final batch size: 1, sequence length: 16257 Attention mask shape: torch.Size([1, 1, 16257, 16257]) Position ids shape: torch.Size([1, 16257]) Input IDs shape: torch.Size([1, 16257]) Labels shape: torch.Size([1, 16257]) Final batch size: 1, sequence length: 35030 Attention mask shape: torch.Size([1, 1, 35030, 35030]) Position ids shape: torch.Size([1, 35030]) Input IDs shape: torch.Size([1, 35030]) Labels shape: torch.Size([1, 35030]) Final batch size: 1, sequence length: 37040 Attention mask shape: torch.Size([1, 1, 37040, 37040]) Position ids shape: torch.Size([1, 37040]) Input IDs shape: torch.Size([1, 37040]) Labels shape: torch.Size([1, 37040]) Final batch size: 1, sequence length: 29537 Attention mask shape: torch.Size([1, 1, 29537, 29537]) Position ids shape: torch.Size([1, 29537]) Input IDs shape: torch.Size([1, 29537]) Labels shape: torch.Size([1, 29537]) Final batch size: 1, sequence length: 17870 Attention mask shape: torch.Size([1, 1, 17870, 17870]) Position ids shape: torch.Size([1, 17870]) Input IDs shape: torch.Size([1, 17870]) Labels shape: torch.Size([1, 17870]) Final batch size: 1, sequence length: 18454 Attention mask shape: torch.Size([1, 1, 18454, 18454]) Position ids shape: torch.Size([1, 18454]) Input IDs shape: torch.Size([1, 18454]) Labels shape: torch.Size([1, 18454]) Final batch size: 1, sequence length: 11608 Attention mask shape: torch.Size([1, 1, 11608, 11608]) Position ids shape: torch.Size([1, 11608]) Input IDs shape: torch.Size([1, 11608]) Labels shape: torch.Size([1, 11608]) Final batch size: 1, sequence length: 30072 Attention mask shape: torch.Size([1, 1, 30072, 30072]) Position ids shape: torch.Size([1, 30072]) Input IDs shape: torch.Size([1, 30072]) Labels shape: torch.Size([1, 30072]) Final batch size: 1, sequence length: 37980 Attention mask shape: torch.Size([1, 1, 37980, 37980]) Position ids shape: torch.Size([1, 37980]) Input IDs shape: torch.Size([1, 37980]) Labels shape: torch.Size([1, 37980]) Final batch size: 1, sequence length: 21491 Attention mask shape: torch.Size([1, 1, 21491, 21491]) Position ids shape: torch.Size([1, 21491]) Input IDs shape: torch.Size([1, 21491]) Labels shape: torch.Size([1, 21491]) Final batch size: 1, sequence length: 24121 Attention mask shape: torch.Size([1, 1, 24121, 24121]) Position ids shape: torch.Size([1, 24121]) Input IDs shape: torch.Size([1, 24121]) Labels shape: torch.Size([1, 24121]) Final batch size: 1, sequence length: 27702 Attention mask shape: torch.Size([1, 1, 27702, 27702]) Position ids shape: torch.Size([1, 27702]) Input IDs shape: torch.Size([1, 27702]) Labels shape: torch.Size([1, 27702]) Final batch size: 1, sequence length: 13622 Attention mask shape: torch.Size([1, 1, 13622, 13622]) Position ids shape: torch.Size([1, 13622]) Input IDs shape: torch.Size([1, 13622]) Labels shape: torch.Size([1, 13622]) Final batch size: 1, sequence length: 30789 Attention mask shape: torch.Size([1, 1, 30789, 30789]) Position ids shape: torch.Size([1, 30789]) Input IDs shape: torch.Size([1, 30789]) Labels shape: torch.Size([1, 30789]) Final batch size: 1, sequence length: 21250 Attention mask shape: torch.Size([1, 1, 21250, 21250]) Position ids shape: torch.Size([1, 21250]) Input IDs shape: torch.Size([1, 21250]) Labels shape: torch.Size([1, 21250]) Final batch size: 1, sequence length: 30859 Attention mask shape: torch.Size([1, 1, 30859, 30859]) Position ids shape: torch.Size([1, 30859]) Input IDs shape: torch.Size([1, 30859]) Labels shape: torch.Size([1, 30859]) Final batch size: 1, sequence length: 19538 Attention mask shape: torch.Size([1, 1, 19538, 19538]) Position ids shape: torch.Size([1, 19538]) Input IDs shape: torch.Size([1, 19538]) Labels shape: torch.Size([1, 19538]) Final batch size: 1, sequence length: 26635 Attention mask shape: torch.Size([1, 1, 26635, 26635]) Position ids shape: torch.Size([1, 26635]) Input IDs shape: torch.Size([1, 26635]) Labels shape: torch.Size([1, 26635]) Final batch size: 1, sequence length: 29875 Attention mask shape: torch.Size([1, 1, 29875, 29875]) Position ids shape: torch.Size([1, 29875]) Input IDs shape: torch.Size([1, 29875]) Labels shape: torch.Size([1, 29875]) Final batch size: 1, sequence length: 17811 Attention mask shape: torch.Size([1, 1, 17811, 17811]) Position ids shape: torch.Size([1, 17811]) Input IDs shape: torch.Size([1, 17811]) Labels shape: torch.Size([1, 17811]) Final batch size: 1, sequence length: 19702 Attention mask shape: torch.Size([1, 1, 19702, 19702]) Position ids shape: torch.Size([1, 19702]) Input IDs shape: torch.Size([1, 19702]) Labels shape: torch.Size([1, 19702]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 26708 Attention mask shape: torch.Size([1, 1, 26708, 26708]) Position ids shape: torch.Size([1, 26708]) Input IDs shape: torch.Size([1, 26708]) Labels shape: torch.Size([1, 26708]) Final batch size: 1, sequence length: 11403 Attention mask shape: torch.Size([1, 1, 11403, 11403]) Position ids shape: torch.Size([1, 11403]) Input IDs shape: torch.Size([1, 11403]) Labels shape: torch.Size([1, 11403]) Final batch size: 1, sequence length: 37992 Attention mask shape: torch.Size([1, 1, 37992, 37992]) Position ids shape: torch.Size([1, 37992]) Input IDs shape: torch.Size([1, 37992]) Labels shape: torch.Size([1, 37992]) Final batch size: 1, sequence length: 30772 Attention mask shape: torch.Size([1, 1, 30772, 30772]) Position ids shape: torch.Size([1, 30772]) Input IDs shape: torch.Size([1, 30772]) Labels shape: torch.Size([1, 30772]) Final batch size: 1, sequence length: 15830 Attention mask shape: torch.Size([1, 1, 15830, 15830]) Position ids shape: torch.Size([1, 15830]) Input IDs shape: torch.Size([1, 15830]) Labels shape: torch.Size([1, 15830]) Final batch size: 1, sequence length: 22309 Attention mask shape: torch.Size([1, 1, 22309, 22309]) Position ids shape: torch.Size([1, 22309]) Input IDs shape: torch.Size([1, 22309]) Labels shape: torch.Size([1, 22309]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40526 Attention mask shape: torch.Size([1, 1, 40526, 40526]) Position ids shape: torch.Size([1, 40526]) Input IDs shape: torch.Size([1, 40526]) Labels shape: torch.Size([1, 40526]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 36580 Attention mask shape: torch.Size([1, 1, 36580, 36580]) Position ids shape: torch.Size([1, 36580]) Input IDs shape: torch.Size([1, 36580]) Labels shape: torch.Size([1, 36580]) Final batch size: 1, sequence length: 28814 Attention mask shape: torch.Size([1, 1, 28814, 28814]) Position ids shape: torch.Size([1, 28814]) Input IDs shape: torch.Size([1, 28814]) Labels shape: torch.Size([1, 28814]) Final batch size: 1, sequence length: 21936 Attention mask shape: torch.Size([1, 1, 21936, 21936]) Position ids shape: torch.Size([1, 21936]) Input IDs shape: torch.Size([1, 21936]) Labels shape: torch.Size([1, 21936]) Final batch size: 1, sequence length: 31745 Attention mask shape: torch.Size([1, 1, 31745, 31745]) Position ids shape: torch.Size([1, 31745]) Input IDs shape: torch.Size([1, 31745]) Labels shape: torch.Size([1, 31745]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 26215 Attention mask shape: torch.Size([1, 1, 26215, 26215]) Position ids shape: torch.Size([1, 26215]) Input IDs shape: torch.Size([1, 26215]) Labels shape: torch.Size([1, 26215]) Final batch size: 1, sequence length: 14730 Attention mask shape: torch.Size([1, 1, 14730, 14730]) Position ids shape: torch.Size([1, 14730]) Input IDs shape: torch.Size([1, 14730]) Labels shape: torch.Size([1, 14730]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 15993 Attention mask shape: torch.Size([1, 1, 15993, 15993]) Position ids shape: torch.Size([1, 15993]) Input IDs shape: torch.Size([1, 15993]) Labels shape: torch.Size([1, 15993]) Final batch size: 1, sequence length: 18606 Attention mask shape: torch.Size([1, 1, 18606, 18606]) Position ids shape: torch.Size([1, 18606]) Input IDs shape: torch.Size([1, 18606]) Labels shape: torch.Size([1, 18606]) Final batch size: 1, sequence length: 34684 Attention mask shape: torch.Size([1, 1, 34684, 34684]) Position ids shape: torch.Size([1, 34684]) Input IDs shape: torch.Size([1, 34684]) Labels shape: torch.Size([1, 34684]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 29464 Attention mask shape: torch.Size([1, 1, 29464, 29464]) Position ids shape: torch.Size([1, 29464]) Input IDs shape: torch.Size([1, 29464]) Labels shape: torch.Size([1, 29464]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 33685 Attention mask shape: torch.Size([1, 1, 33685, 33685]) Position ids shape: torch.Size([1, 33685]) Input IDs shape: torch.Size([1, 33685]) Labels shape: torch.Size([1, 33685]) Final batch size: 1, sequence length: 29639 Attention mask shape: torch.Size([1, 1, 29639, 29639]) Position ids shape: torch.Size([1, 29639]) Input IDs shape: torch.Size([1, 29639]) Labels shape: torch.Size([1, 29639]) Final batch size: 1, sequence length: 39589 Attention mask shape: torch.Size([1, 1, 39589, 39589]) Position ids shape: torch.Size([1, 39589]) Input IDs shape: torch.Size([1, 39589]) Labels shape: torch.Size([1, 39589]) Final batch size: 1, sequence length: 35077 Attention mask shape: torch.Size([1, 1, 35077, 35077]) Position ids shape: torch.Size([1, 35077]) Input IDs shape: torch.Size([1, 35077]) Labels shape: torch.Size([1, 35077]) Final batch size: 1, sequence length: 38927 Attention mask shape: torch.Size([1, 1, 38927, 38927]) Position ids shape: torch.Size([1, 38927]) Input IDs shape: torch.Size([1, 38927]) Labels shape: torch.Size([1, 38927]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 36716 Attention mask shape: torch.Size([1, 1, 36716, 36716]) Position ids shape: torch.Size([1, 36716]) Input IDs shape: torch.Size([1, 36716]) Labels shape: torch.Size([1, 36716]) Final batch size: 1, sequence length: 22945 Attention mask shape: torch.Size([1, 1, 22945, 22945]) Position ids shape: torch.Size([1, 22945]) Input IDs shape: torch.Size([1, 22945]) Labels shape: torch.Size([1, 22945]) Final batch size: 1, sequence length: 17778 Attention mask shape: torch.Size([1, 1, 17778, 17778]) Position ids shape: torch.Size([1, 17778]) Input IDs shape: torch.Size([1, 17778]) Labels shape: torch.Size([1, 17778]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 24232 Attention mask shape: torch.Size([1, 1, 24232, 24232]) Position ids shape: torch.Size([1, 24232]) Input IDs shape: torch.Size([1, 24232]) Labels shape: torch.Size([1, 24232]) Final batch size: 1, sequence length: 30135 Attention mask shape: torch.Size([1, 1, 30135, 30135]) Position ids shape: torch.Size([1, 30135]) Input IDs shape: torch.Size([1, 30135]) Labels shape: torch.Size([1, 30135]) Final batch size: 1, sequence length: 19028 Attention mask shape: torch.Size([1, 1, 19028, 19028]) Position ids shape: torch.Size([1, 19028]) Input IDs shape: torch.Size([1, 19028]) Labels shape: torch.Size([1, 19028]) Final batch size: 1, sequence length: 26930 Attention mask shape: torch.Size([1, 1, 26930, 26930]) Position ids shape: torch.Size([1, 26930]) Input IDs shape: torch.Size([1, 26930]) Labels shape: torch.Size([1, 26930]) Final batch size: 1, sequence length: 39661 Attention mask shape: torch.Size([1, 1, 39661, 39661]) Position ids shape: torch.Size([1, 39661]) Input IDs shape: torch.Size([1, 39661]) Labels shape: torch.Size([1, 39661]) Final batch size: 1, sequence length: 20057 Attention mask shape: torch.Size([1, 1, 20057, 20057]) Position ids shape: torch.Size([1, 20057]) Input IDs shape: torch.Size([1, 20057]) Labels shape: torch.Size([1, 20057]) Final batch size: 1, sequence length: 16270 Attention mask shape: torch.Size([1, 1, 16270, 16270]) Position ids shape: torch.Size([1, 16270]) Input IDs shape: torch.Size([1, 16270]) Labels shape: torch.Size([1, 16270]) Final batch size: 1, sequence length: 35952 Attention mask shape: torch.Size([1, 1, 35952, 35952]) Position ids shape: torch.Size([1, 35952]) Input IDs shape: torch.Size([1, 35952]) Labels shape: torch.Size([1, 35952]) Final batch size: 1, sequence length: 17623 Attention mask shape: torch.Size([1, 1, 17623, 17623]) Position ids shape: torch.Size([1, 17623]) Input IDs shape: torch.Size([1, 17623]) Labels shape: torch.Size([1, 17623]) Final batch size: 1, sequence length: 28810 Attention mask shape: torch.Size([1, 1, 28810, 28810]) Position ids shape: torch.Size([1, 28810]) Input IDs shape: torch.Size([1, 28810]) Labels shape: torch.Size([1, 28810]) Final batch size: 1, sequence length: 37866 Attention mask shape: torch.Size([1, 1, 37866, 37866]) Position ids shape: torch.Size([1, 37866]) Input IDs shape: torch.Size([1, 37866]) Labels shape: torch.Size([1, 37866]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 31381 Attention mask shape: torch.Size([1, 1, 31381, 31381]) Position ids shape: torch.Size([1, 31381]) Input IDs shape: torch.Size([1, 31381]) Labels shape: torch.Size([1, 31381]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 20765 Attention mask shape: torch.Size([1, 1, 20765, 20765]) Position ids shape: torch.Size([1, 20765]) Input IDs shape: torch.Size([1, 20765]) Labels shape: torch.Size([1, 20765]) Final batch size: 1, sequence length: 17971 Attention mask shape: torch.Size([1, 1, 17971, 17971]) Position ids shape: torch.Size([1, 17971]) Input IDs shape: torch.Size([1, 17971]) Labels shape: torch.Size([1, 17971]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 27281 Attention mask shape: torch.Size([1, 1, 27281, 27281]) Position ids shape: torch.Size([1, 27281]) Input IDs shape: torch.Size([1, 27281]) Labels shape: torch.Size([1, 27281]) Final batch size: 1, sequence length: 37407 Attention mask shape: torch.Size([1, 1, 37407, 37407]) Position ids shape: torch.Size([1, 37407]) Input IDs shape: torch.Size([1, 37407]) Labels shape: torch.Size([1, 37407]) Final batch size: 1, sequence length: 31420 Attention mask shape: torch.Size([1, 1, 31420, 31420]) Position ids shape: torch.Size([1, 31420]) Input IDs shape: torch.Size([1, 31420]) Labels shape: torch.Size([1, 31420]) Final batch size: 1, sequence length: 13903 Attention mask shape: torch.Size([1, 1, 13903, 13903]) Position ids shape: torch.Size([1, 13903]) Input IDs shape: torch.Size([1, 13903]) Labels shape: torch.Size([1, 13903]) Final batch size: 1, sequence length: 21071 Attention mask shape: torch.Size([1, 1, 21071, 21071]) Position ids shape: torch.Size([1, 21071]) Input IDs shape: torch.Size([1, 21071]) Labels shape: torch.Size([1, 21071]) Final batch size: 1, sequence length: 25032 Attention mask shape: torch.Size([1, 1, 25032, 25032]) Position ids shape: torch.Size([1, 25032]) Input IDs shape: torch.Size([1, 25032]) Labels shape: torch.Size([1, 25032]) Final batch size: 1, sequence length: 21006 Attention mask shape: torch.Size([1, 1, 21006, 21006]) Position ids shape: torch.Size([1, 21006]) Input IDs shape: torch.Size([1, 21006]) Labels shape: torch.Size([1, 21006]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 37397 Attention mask shape: torch.Size([1, 1, 37397, 37397]) Position ids shape: torch.Size([1, 37397]) Input IDs shape: torch.Size([1, 37397]) Labels shape: torch.Size([1, 37397]) Final batch size: 1, sequence length: 14104 Attention mask shape: torch.Size([1, 1, 14104, 14104]) Position ids shape: torch.Size([1, 14104]) Input IDs shape: torch.Size([1, 14104]) Labels shape: torch.Size([1, 14104]) Final batch size: 1, sequence length: 25388 Attention mask shape: torch.Size([1, 1, 25388, 25388]) Position ids shape: torch.Size([1, 25388]) Input IDs shape: torch.Size([1, 25388]) Labels shape: torch.Size([1, 25388]) Final batch size: 1, sequence length: 21805 Attention mask shape: torch.Size([1, 1, 21805, 21805]) Position ids shape: torch.Size([1, 21805]) Input IDs shape: torch.Size([1, 21805]) Labels shape: torch.Size([1, 21805]) Final batch size: 1, sequence length: 26665 Attention mask shape: torch.Size([1, 1, 26665, 26665]) Position ids shape: torch.Size([1, 26665]) Input IDs shape: torch.Size([1, 26665]) Labels shape: torch.Size([1, 26665]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 35879 Attention mask shape: torch.Size([1, 1, 35879, 35879]) Position ids shape: torch.Size([1, 35879]) Input IDs shape: torch.Size([1, 35879]) Labels shape: torch.Size([1, 35879]) Final batch size: 1, sequence length: 40579 Attention mask shape: torch.Size([1, 1, 40579, 40579]) Position ids shape: torch.Size([1, 40579]) Input IDs shape: torch.Size([1, 40579]) Labels shape: torch.Size([1, 40579]) Final batch size: 1, sequence length: 18566 Attention mask shape: torch.Size([1, 1, 18566, 18566]) Position ids shape: torch.Size([1, 18566]) Input IDs shape: torch.Size([1, 18566]) Labels shape: torch.Size([1, 18566]) Final batch size: 1, sequence length: 36415 Attention mask shape: torch.Size([1, 1, 36415, 36415]) Position ids shape: torch.Size([1, 36415]) Input IDs shape: torch.Size([1, 36415]) Labels shape: torch.Size([1, 36415]) Final batch size: 1, sequence length: 23187 Attention mask shape: torch.Size([1, 1, 23187, 23187]) Position ids shape: torch.Size([1, 23187]) Input IDs shape: torch.Size([1, 23187]) Labels shape: torch.Size([1, 23187]) Final batch size: 1, sequence length: 39587 Attention mask shape: torch.Size([1, 1, 39587, 39587]) Position ids shape: torch.Size([1, 39587]) Input IDs shape: torch.Size([1, 39587]) Labels shape: torch.Size([1, 39587]) Final batch size: 1, sequence length: 32024 Attention mask shape: torch.Size([1, 1, 32024, 32024]) Position ids shape: torch.Size([1, 32024]) Input IDs shape: torch.Size([1, 32024]) Labels shape: torch.Size([1, 32024]) Final batch size: 1, sequence length: 12200 Attention mask shape: torch.Size([1, 1, 12200, 12200]) Position ids shape: torch.Size([1, 12200]) Input IDs shape: torch.Size([1, 12200]) Labels shape: torch.Size([1, 12200]) Final batch size: 1, sequence length: 39778 Attention mask shape: torch.Size([1, 1, 39778, 39778]) Position ids shape: torch.Size([1, 39778]) Input IDs shape: torch.Size([1, 39778]) Labels shape: torch.Size([1, 39778]) Final batch size: 1, sequence length: 37530 Attention mask shape: torch.Size([1, 1, 37530, 37530]) Position ids shape: torch.Size([1, 37530]) Input IDs shape: torch.Size([1, 37530]) Labels shape: torch.Size([1, 37530]) Final batch size: 1, sequence length: 32541 Attention mask shape: torch.Size([1, 1, 32541, 32541]) Position ids shape: torch.Size([1, 32541]) Input IDs shape: torch.Size([1, 32541]) Labels shape: torch.Size([1, 32541]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 19668 Attention mask shape: torch.Size([1, 1, 19668, 19668]) Position ids shape: torch.Size([1, 19668]) Input IDs shape: torch.Size([1, 19668]) Labels shape: torch.Size([1, 19668]) Final batch size: 1, sequence length: 39579 Attention mask shape: torch.Size([1, 1, 39579, 39579]) Position ids shape: torch.Size([1, 39579]) Input IDs shape: torch.Size([1, 39579]) Labels shape: torch.Size([1, 39579]) Final batch size: 1, sequence length: 37890 Attention mask shape: torch.Size([1, 1, 37890, 37890]) Position ids shape: torch.Size([1, 37890]) Input IDs shape: torch.Size([1, 37890]) Labels shape: torch.Size([1, 37890]) Final batch size: 1, sequence length: 25383 Attention mask shape: torch.Size([1, 1, 25383, 25383]) Position ids shape: torch.Size([1, 25383]) Input IDs shape: torch.Size([1, 25383]) Labels shape: torch.Size([1, 25383]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 36647 Attention mask shape: torch.Size([1, 1, 36647, 36647]) Position ids shape: torch.Size([1, 36647]) Input IDs shape: torch.Size([1, 36647]) Labels shape: torch.Size([1, 36647]) Final batch size: 1, sequence length: 9251 Attention mask shape: torch.Size([1, 1, 9251, 9251]) Position ids shape: torch.Size([1, 9251]) Input IDs shape: torch.Size([1, 9251]) Labels shape: torch.Size([1, 9251]) Final batch size: 1, sequence length: 14308 Attention mask shape: torch.Size([1, 1, 14308, 14308]) Position ids shape: torch.Size([1, 14308]) Input IDs shape: torch.Size([1, 14308]) Labels shape: torch.Size([1, 14308]) {'loss': 0.252, 'grad_norm': 0.2763124136389419, 'learning_rate': 5.522642316338268e-06, 'num_tokens': -inf, 'epoch': 4.12} Final batch size: 1, sequence length: 4858 Attention mask shape: torch.Size([1, 1, 4858, 4858]) Position ids shape: torch.Size([1, 4858]) Input IDs shape: torch.Size([1, 4858]) Labels shape: torch.Size([1, 4858]) Final batch size: 1, sequence length: 6316 Attention mask shape: torch.Size([1, 1, 6316, 6316]) Position ids shape: torch.Size([1, 6316]) Input IDs shape: torch.Size([1, 6316]) Labels shape: torch.Size([1, 6316]) Final batch size: 1, sequence length: 7360 Attention mask shape: torch.Size([1, 1, 7360, 7360]) Position ids shape: torch.Size([1, 7360]) Input IDs shape: torch.Size([1, 7360]) Labels shape: torch.Size([1, 7360]) Final batch size: 1, sequence length: 11448 Attention mask shape: torch.Size([1, 1, 11448, 11448]) Position ids shape: torch.Size([1, 11448]) Input IDs shape: torch.Size([1, 11448]) Labels shape: torch.Size([1, 11448]) Final batch size: 1, sequence length: 12075 Attention mask shape: torch.Size([1, 1, 12075, 12075]) Position ids shape: torch.Size([1, 12075]) Input IDs shape: torch.Size([1, 12075]) Labels shape: torch.Size([1, 12075]) Final batch size: 1, sequence length: 12846 Attention mask shape: torch.Size([1, 1, 12846, 12846]) Position ids shape: torch.Size([1, 12846]) Input IDs shape: torch.Size([1, 12846]) Labels shape: torch.Size([1, 12846]) Final batch size: 1, sequence length: 12945 Attention mask shape: torch.Size([1, 1, 12945, 12945]) Position ids shape: torch.Size([1, 12945]) Input IDs shape: torch.Size([1, 12945]) Labels shape: torch.Size([1, 12945]) Final batch size: 1, sequence length: 14330 Attention mask shape: torch.Size([1, 1, 14330, 14330]) Position ids shape: torch.Size([1, 14330]) Input IDs shape: torch.Size([1, 14330]) Labels shape: torch.Size([1, 14330]) Final batch size: 1, sequence length: 14891 Attention mask shape: torch.Size([1, 1, 14891, 14891]) Position ids shape: torch.Size([1, 14891]) Input IDs shape: torch.Size([1, 14891]) Labels shape: torch.Size([1, 14891]) Final batch size: 1, sequence length: 15189 Attention mask shape: torch.Size([1, 1, 15189, 15189]) Position ids shape: torch.Size([1, 15189]) Input IDs shape: torch.Size([1, 15189]) Labels shape: torch.Size([1, 15189]) Final batch size: 1, sequence length: 16961 Attention mask shape: torch.Size([1, 1, 16961, 16961]) Position ids shape: torch.Size([1, 16961]) Input IDs shape: torch.Size([1, 16961]) Labels shape: torch.Size([1, 16961]) Final batch size: 1, sequence length: 16658 Attention mask shape: torch.Size([1, 1, 16658, 16658]) Position ids shape: torch.Size([1, 16658]) Input IDs shape: torch.Size([1, 16658]) Labels shape: torch.Size([1, 16658]) Final batch size: 1, sequence length: 17246 Attention mask shape: torch.Size([1, 1, 17246, 17246]) Position ids shape: torch.Size([1, 17246]) Input IDs shape: torch.Size([1, 17246]) Labels shape: torch.Size([1, 17246]) Final batch size: 1, sequence length: 18341 Attention mask shape: torch.Size([1, 1, 18341, 18341]) Position ids shape: torch.Size([1, 18341]) Input IDs shape: torch.Size([1, 18341]) Labels shape: torch.Size([1, 18341]) Final batch size: 1, sequence length: 16536 Attention mask shape: torch.Size([1, 1, 16536, 16536]) Position ids shape: torch.Size([1, 16536]) Input IDs shape: torch.Size([1, 16536]) Labels shape: torch.Size([1, 16536]) Final batch size: 1, sequence length: 19332 Attention mask shape: torch.Size([1, 1, 19332, 19332]) Position ids shape: torch.Size([1, 19332]) Input IDs shape: torch.Size([1, 19332]) Labels shape: torch.Size([1, 19332]) Final batch size: 1, sequence length: 16391 Attention mask shape: torch.Size([1, 1, 16391, 16391]) Position ids shape: torch.Size([1, 16391]) Input IDs shape: torch.Size([1, 16391]) Labels shape: torch.Size([1, 16391]) Final batch size: 1, sequence length: 19999 Attention mask shape: torch.Size([1, 1, 19999, 19999]) Position ids shape: torch.Size([1, 19999]) Input IDs shape: torch.Size([1, 19999]) Labels shape: torch.Size([1, 19999]) Final batch size: 1, sequence length: 16326 Attention mask shape: torch.Size([1, 1, 16326, 16326]) Position ids shape: torch.Size([1, 16326]) Input IDs shape: torch.Size([1, 16326]) Labels shape: torch.Size([1, 16326]) Final batch size: 1, sequence length: 21617 Attention mask shape: torch.Size([1, 1, 21617, 21617]) Position ids shape: torch.Size([1, 21617]) Input IDs shape: torch.Size([1, 21617]) Labels shape: torch.Size([1, 21617]) Final batch size: 1, sequence length: 18424 Attention mask shape: torch.Size([1, 1, 18424, 18424]) Position ids shape: torch.Size([1, 18424]) Input IDs shape: torch.Size([1, 18424]) Labels shape: torch.Size([1, 18424]) Final batch size: 1, sequence length: 18014 Attention mask shape: torch.Size([1, 1, 18014, 18014]) Position ids shape: torch.Size([1, 18014]) Input IDs shape: torch.Size([1, 18014]) Labels shape: torch.Size([1, 18014]) Final batch size: 1, sequence length: 21122 Attention mask shape: torch.Size([1, 1, 21122, 21122]) Position ids shape: torch.Size([1, 21122]) Input IDs shape: torch.Size([1, 21122]) Labels shape: torch.Size([1, 21122]) Final batch size: 1, sequence length: 18400 Attention mask shape: torch.Size([1, 1, 18400, 18400]) Position ids shape: torch.Size([1, 18400]) Input IDs shape: torch.Size([1, 18400]) Labels shape: torch.Size([1, 18400]) Final batch size: 1, sequence length: 20888 Attention mask shape: torch.Size([1, 1, 20888, 20888]) Position ids shape: torch.Size([1, 20888]) Input IDs shape: torch.Size([1, 20888]) Labels shape: torch.Size([1, 20888]) Final batch size: 1, sequence length: 20118 Attention mask shape: torch.Size([1, 1, 20118, 20118]) Position ids shape: torch.Size([1, 20118]) Input IDs shape: torch.Size([1, 20118]) Labels shape: torch.Size([1, 20118]) Final batch size: 1, sequence length: 19597 Attention mask shape: torch.Size([1, 1, 19597, 19597]) Position ids shape: torch.Size([1, 19597]) Input IDs shape: torch.Size([1, 19597]) Labels shape: torch.Size([1, 19597]) Final batch size: 1, sequence length: 20912 Attention mask shape: torch.Size([1, 1, 20912, 20912]) Position ids shape: torch.Size([1, 20912]) Input IDs shape: torch.Size([1, 20912]) Labels shape: torch.Size([1, 20912]) Final batch size: 1, sequence length: 18393 Attention mask shape: torch.Size([1, 1, 18393, 18393]) Position ids shape: torch.Size([1, 18393]) Input IDs shape: torch.Size([1, 18393]) Labels shape: torch.Size([1, 18393]) Final batch size: 1, sequence length: 21611 Attention mask shape: torch.Size([1, 1, 21611, 21611]) Position ids shape: torch.Size([1, 21611]) Input IDs shape: torch.Size([1, 21611]) Labels shape: torch.Size([1, 21611]) Final batch size: 1, sequence length: 16831 Attention mask shape: torch.Size([1, 1, 16831, 16831]) Position ids shape: torch.Size([1, 16831]) Input IDs shape: torch.Size([1, 16831]) Labels shape: torch.Size([1, 16831]) Final batch size: 1, sequence length: 13638 Attention mask shape: torch.Size([1, 1, 13638, 13638]) Position ids shape: torch.Size([1, 13638]) Input IDs shape: torch.Size([1, 13638]) Labels shape: torch.Size([1, 13638]) Final batch size: 1, sequence length: 22854 Attention mask shape: torch.Size([1, 1, 22854, 22854]) Position ids shape: torch.Size([1, 22854]) Input IDs shape: torch.Size([1, 22854]) Labels shape: torch.Size([1, 22854]) Final batch size: 1, sequence length: 21314 Attention mask shape: torch.Size([1, 1, 21314, 21314]) Position ids shape: torch.Size([1, 21314]) Input IDs shape: torch.Size([1, 21314]) Labels shape: torch.Size([1, 21314]) Final batch size: 1, sequence length: 6839 Attention mask shape: torch.Size([1, 1, 6839, 6839]) Position ids shape: torch.Size([1, 6839]) Input IDs shape: torch.Size([1, 6839]) Labels shape: torch.Size([1, 6839]) Final batch size: 1, sequence length: 9341 Attention mask shape: torch.Size([1, 1, 9341, 9341]) Position ids shape: torch.Size([1, 9341]) Input IDs shape: torch.Size([1, 9341]) Labels shape: torch.Size([1, 9341]) Final batch size: 1, sequence length: 12328 Attention mask shape: torch.Size([1, 1, 12328, 12328]) Position ids shape: torch.Size([1, 12328]) Input IDs shape: torch.Size([1, 12328]) Labels shape: torch.Size([1, 12328]) Final batch size: 1, sequence length: 15026 Attention mask shape: torch.Size([1, 1, 15026, 15026]) Position ids shape: torch.Size([1, 15026]) Input IDs shape: torch.Size([1, 15026]) Labels shape: torch.Size([1, 15026]) Final batch size: 1, sequence length: 21739 Attention mask shape: torch.Size([1, 1, 21739, 21739]) Position ids shape: torch.Size([1, 21739]) Input IDs shape: torch.Size([1, 21739]) Labels shape: torch.Size([1, 21739]) Final batch size: 1, sequence length: 16611 Attention mask shape: torch.Size([1, 1, 16611, 16611]) Position ids shape: torch.Size([1, 16611]) Input IDs shape: torch.Size([1, 16611]) Labels shape: torch.Size([1, 16611]) Final batch size: 1, sequence length: 25042 Attention mask shape: torch.Size([1, 1, 25042, 25042]) Position ids shape: torch.Size([1, 25042]) Input IDs shape: torch.Size([1, 25042]) Labels shape: torch.Size([1, 25042]) Final batch size: 1, sequence length: 23614 Attention mask shape: torch.Size([1, 1, 23614, 23614]) Position ids shape: torch.Size([1, 23614]) Input IDs shape: torch.Size([1, 23614]) Labels shape: torch.Size([1, 23614]) Final batch size: 1, sequence length: 21725 Attention mask shape: torch.Size([1, 1, 21725, 21725]) Position ids shape: torch.Size([1, 21725]) Input IDs shape: torch.Size([1, 21725]) Labels shape: torch.Size([1, 21725]) Final batch size: 1, sequence length: 21677 Attention mask shape: torch.Size([1, 1, 21677, 21677]) Position ids shape: torch.Size([1, 21677]) Input IDs shape: torch.Size([1, 21677]) Labels shape: torch.Size([1, 21677]) Final batch size: 1, sequence length: 25909 Attention mask shape: torch.Size([1, 1, 25909, 25909]) Position ids shape: torch.Size([1, 25909]) Input IDs shape: torch.Size([1, 25909]) Labels shape: torch.Size([1, 25909]) Final batch size: 1, sequence length: 28348 Attention mask shape: torch.Size([1, 1, 28348, 28348]) Position ids shape: torch.Size([1, 28348]) Input IDs shape: torch.Size([1, 28348]) Labels shape: torch.Size([1, 28348]) Final batch size: 1, sequence length: 26218 Attention mask shape: torch.Size([1, 1, 26218, 26218]) Position ids shape: torch.Size([1, 26218]) Input IDs shape: torch.Size([1, 26218]) Labels shape: torch.Size([1, 26218]) Final batch size: 1, sequence length: 13646 Attention mask shape: torch.Size([1, 1, 13646, 13646]) Position ids shape: torch.Size([1, 13646]) Input IDs shape: torch.Size([1, 13646]) Labels shape: torch.Size([1, 13646]) Final batch size: 1, sequence length: 29730 Attention mask shape: torch.Size([1, 1, 29730, 29730]) Position ids shape: torch.Size([1, 29730]) Input IDs shape: torch.Size([1, 29730]) Labels shape: torch.Size([1, 29730]) Final batch size: 1, sequence length: 15232 Attention mask shape: torch.Size([1, 1, 15232, 15232]) Position ids shape: torch.Size([1, 15232]) Input IDs shape: torch.Size([1, 15232]) Labels shape: torch.Size([1, 15232]) Final batch size: 1, sequence length: 25575 Attention mask shape: torch.Size([1, 1, 25575, 25575]) Position ids shape: torch.Size([1, 25575]) Input IDs shape: torch.Size([1, 25575]) Labels shape: torch.Size([1, 25575]) Final batch size: 1, sequence length: 19885 Attention mask shape: torch.Size([1, 1, 19885, 19885]) Position ids shape: torch.Size([1, 19885]) Input IDs shape: torch.Size([1, 19885]) Labels shape: torch.Size([1, 19885]) Final batch size: 1, sequence length: 24342 Attention mask shape: torch.Size([1, 1, 24342, 24342]) Position ids shape: torch.Size([1, 24342]) Input IDs shape: torch.Size([1, 24342]) Labels shape: torch.Size([1, 24342]) Final batch size: 1, sequence length: 22217 Attention mask shape: torch.Size([1, 1, 22217, 22217]) Position ids shape: torch.Size([1, 22217]) Input IDs shape: torch.Size([1, 22217]) Labels shape: torch.Size([1, 22217]) Final batch size: 1, sequence length: 20619 Attention mask shape: torch.Size([1, 1, 20619, 20619]) Position ids shape: torch.Size([1, 20619]) Input IDs shape: torch.Size([1, 20619]) Labels shape: torch.Size([1, 20619]) Final batch size: 1, sequence length: 27795 Attention mask shape: torch.Size([1, 1, 27795, 27795]) Position ids shape: torch.Size([1, 27795]) Input IDs shape: torch.Size([1, 27795]) Labels shape: torch.Size([1, 27795]) Final batch size: 1, sequence length: 24597 Attention mask shape: torch.Size([1, 1, 24597, 24597]) Position ids shape: torch.Size([1, 24597]) Input IDs shape: torch.Size([1, 24597]) Labels shape: torch.Size([1, 24597]) Final batch size: 1, sequence length: 30410 Attention mask shape: torch.Size([1, 1, 30410, 30410]) Position ids shape: torch.Size([1, 30410]) Input IDs shape: torch.Size([1, 30410]) Labels shape: torch.Size([1, 30410]) Final batch size: 1, sequence length: 20915 Attention mask shape: torch.Size([1, 1, 20915, 20915]) Position ids shape: torch.Size([1, 20915]) Input IDs shape: torch.Size([1, 20915]) Labels shape: torch.Size([1, 20915]) Final batch size: 1, sequence length: 21623 Attention mask shape: torch.Size([1, 1, 21623, 21623]) Position ids shape: torch.Size([1, 21623]) Input IDs shape: torch.Size([1, 21623]) Labels shape: torch.Size([1, 21623]) Final batch size: 1, sequence length: 24090 Attention mask shape: torch.Size([1, 1, 24090, 24090]) Position ids shape: torch.Size([1, 24090]) Input IDs shape: torch.Size([1, 24090]) Labels shape: torch.Size([1, 24090]) Final batch size: 1, sequence length: 28678 Attention mask shape: torch.Size([1, 1, 28678, 28678]) Position ids shape: torch.Size([1, 28678]) Input IDs shape: torch.Size([1, 28678]) Labels shape: torch.Size([1, 28678]) Final batch size: 1, sequence length: 30917 Attention mask shape: torch.Size([1, 1, 30917, 30917]) Position ids shape: torch.Size([1, 30917]) Input IDs shape: torch.Size([1, 30917]) Labels shape: torch.Size([1, 30917]) Final batch size: 1, sequence length: 17780 Attention mask shape: torch.Size([1, 1, 17780, 17780]) Position ids shape: torch.Size([1, 17780]) Input IDs shape: torch.Size([1, 17780]) Labels shape: torch.Size([1, 17780]) Final batch size: 1, sequence length: 20702 Attention mask shape: torch.Size([1, 1, 20702, 20702]) Position ids shape: torch.Size([1, 20702]) Input IDs shape: torch.Size([1, 20702]) Labels shape: torch.Size([1, 20702]) Final batch size: 1, sequence length: 32034 Attention mask shape: torch.Size([1, 1, 32034, 32034]) Position ids shape: torch.Size([1, 32034]) Input IDs shape: torch.Size([1, 32034]) Labels shape: torch.Size([1, 32034]) Final batch size: 1, sequence length: 30766 Attention mask shape: torch.Size([1, 1, 30766, 30766]) Position ids shape: torch.Size([1, 30766]) Input IDs shape: torch.Size([1, 30766]) Labels shape: torch.Size([1, 30766]) Final batch size: 1, sequence length: 16403 Attention mask shape: torch.Size([1, 1, 16403, 16403]) Position ids shape: torch.Size([1, 16403]) Input IDs shape: torch.Size([1, 16403]) Labels shape: torch.Size([1, 16403]) Final batch size: 1, sequence length: 25698 Attention mask shape: torch.Size([1, 1, 25698, 25698]) Position ids shape: torch.Size([1, 25698]) Input IDs shape: torch.Size([1, 25698]) Labels shape: torch.Size([1, 25698]) Final batch size: 1, sequence length: 35781 Attention mask shape: torch.Size([1, 1, 35781, 35781]) Position ids shape: torch.Size([1, 35781]) Input IDs shape: torch.Size([1, 35781]) Labels shape: torch.Size([1, 35781]) Final batch size: 1, sequence length: 35628 Attention mask shape: torch.Size([1, 1, 35628, 35628]) Position ids shape: torch.Size([1, 35628]) Input IDs shape: torch.Size([1, 35628]) Labels shape: torch.Size([1, 35628]) Final batch size: 1, sequence length: 30366 Attention mask shape: torch.Size([1, 1, 30366, 30366]) Position ids shape: torch.Size([1, 30366]) Input IDs shape: torch.Size([1, 30366]) Labels shape: torch.Size([1, 30366]) Final batch size: 1, sequence length: 23626 Attention mask shape: torch.Size([1, 1, 23626, 23626]) Position ids shape: torch.Size([1, 23626]) Input IDs shape: torch.Size([1, 23626]) Labels shape: torch.Size([1, 23626]) Final batch size: 1, sequence length: 14070 Attention mask shape: torch.Size([1, 1, 14070, 14070]) Position ids shape: torch.Size([1, 14070]) Input IDs shape: torch.Size([1, 14070]) Labels shape: torch.Size([1, 14070]) Final batch size: 1, sequence length: 21562 Attention mask shape: torch.Size([1, 1, 21562, 21562]) Position ids shape: torch.Size([1, 21562]) Input IDs shape: torch.Size([1, 21562]) Labels shape: torch.Size([1, 21562]) Final batch size: 1, sequence length: 37195 Attention mask shape: torch.Size([1, 1, 37195, 37195]) Position ids shape: torch.Size([1, 37195]) Input IDs shape: torch.Size([1, 37195]) Labels shape: torch.Size([1, 37195]) Final batch size: 1, sequence length: 38190 Attention mask shape: torch.Size([1, 1, 38190, 38190]) Position ids shape: torch.Size([1, 38190]) Input IDs shape: torch.Size([1, 38190]) Labels shape: torch.Size([1, 38190]) Final batch size: 1, sequence length: 35697 Attention mask shape: torch.Size([1, 1, 35697, 35697]) Position ids shape: torch.Size([1, 35697]) Input IDs shape: torch.Size([1, 35697]) Labels shape: torch.Size([1, 35697]) Final batch size: 1, sequence length: 20028 Attention mask shape: torch.Size([1, 1, 20028, 20028]) Position ids shape: torch.Size([1, 20028]) Input IDs shape: torch.Size([1, 20028]) Labels shape: torch.Size([1, 20028]) Final batch size: 1, sequence length: 32630 Attention mask shape: torch.Size([1, 1, 32630, 32630]) Position ids shape: torch.Size([1, 32630]) Input IDs shape: torch.Size([1, 32630]) Labels shape: torch.Size([1, 32630]) Final batch size: 1, sequence length: 32239 Attention mask shape: torch.Size([1, 1, 32239, 32239]) Position ids shape: torch.Size([1, 32239]) Input IDs shape: torch.Size([1, 32239]) Labels shape: torch.Size([1, 32239]) Final batch size: 1, sequence length: 38420 Attention mask shape: torch.Size([1, 1, 38420, 38420]) Position ids shape: torch.Size([1, 38420]) Input IDs shape: torch.Size([1, 38420]) Labels shape: torch.Size([1, 38420]) Final batch size: 1, sequence length: 22768 Attention mask shape: torch.Size([1, 1, 22768, 22768]) Position ids shape: torch.Size([1, 22768]) Input IDs shape: torch.Size([1, 22768]) Labels shape: torch.Size([1, 22768]) Final batch size: 1, sequence length: 39608 Attention mask shape: torch.Size([1, 1, 39608, 39608]) Position ids shape: torch.Size([1, 39608]) Input IDs shape: torch.Size([1, 39608]) Labels shape: torch.Size([1, 39608]) Final batch size: 1, sequence length: 11896 Attention mask shape: torch.Size([1, 1, 11896, 11896]) Position ids shape: torch.Size([1, 11896]) Input IDs shape: torch.Size([1, 11896]) Labels shape: torch.Size([1, 11896]) Final batch size: 1, sequence length: 34802 Attention mask shape: torch.Size([1, 1, 34802, 34802]) Position ids shape: torch.Size([1, 34802]) Input IDs shape: torch.Size([1, 34802]) Labels shape: torch.Size([1, 34802]) Final batch size: 1, sequence length: 38362 Attention mask shape: torch.Size([1, 1, 38362, 38362]) Position ids shape: torch.Size([1, 38362]) Input IDs shape: torch.Size([1, 38362]) Labels shape: torch.Size([1, 38362]) Final batch size: 1, sequence length: 30796 Attention mask shape: torch.Size([1, 1, 30796, 30796]) Position ids shape: torch.Size([1, 30796]) Input IDs shape: torch.Size([1, 30796]) Labels shape: torch.Size([1, 30796]) Final batch size: 1, sequence length: 18568 Attention mask shape: torch.Size([1, 1, 18568, 18568]) Position ids shape: torch.Size([1, 18568]) Input IDs shape: torch.Size([1, 18568]) Labels shape: torch.Size([1, 18568]) Final batch size: 1, sequence length: 14828 Attention mask shape: torch.Size([1, 1, 14828, 14828]) Position ids shape: torch.Size([1, 14828]) Input IDs shape: torch.Size([1, 14828]) Labels shape: torch.Size([1, 14828]) Final batch size: 1, sequence length: 15726 Attention mask shape: torch.Size([1, 1, 15726, 15726]) Position ids shape: torch.Size([1, 15726]) Input IDs shape: torch.Size([1, 15726]) Labels shape: torch.Size([1, 15726]) Final batch size: 1, sequence length: 32308 Attention mask shape: torch.Size([1, 1, 32308, 32308]) Position ids shape: torch.Size([1, 32308]) Input IDs shape: torch.Size([1, 32308]) Labels shape: torch.Size([1, 32308]) Final batch size: 1, sequence length: 22199 Attention mask shape: torch.Size([1, 1, 22199, 22199]) Position ids shape: torch.Size([1, 22199]) Input IDs shape: torch.Size([1, 22199]) Labels shape: torch.Size([1, 22199]) Final batch size: 1, sequence length: 10426 Attention mask shape: torch.Size([1, 1, 10426, 10426]) Position ids shape: torch.Size([1, 10426]) Input IDs shape: torch.Size([1, 10426]) Labels shape: torch.Size([1, 10426]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 26287 Attention mask shape: torch.Size([1, 1, 26287, 26287]) Position ids shape: torch.Size([1, 26287]) Input IDs shape: torch.Size([1, 26287]) Labels shape: torch.Size([1, 26287]) Final batch size: 1, sequence length: 29202 Attention mask shape: torch.Size([1, 1, 29202, 29202]) Position ids shape: torch.Size([1, 29202]) Input IDs shape: torch.Size([1, 29202]) Labels shape: torch.Size([1, 29202]) Final batch size: 1, sequence length: 37529 Attention mask shape: torch.Size([1, 1, 37529, 37529]) Position ids shape: torch.Size([1, 37529]) Input IDs shape: torch.Size([1, 37529]) Labels shape: torch.Size([1, 37529]) Final batch size: 1, sequence length: 30651 Attention mask shape: torch.Size([1, 1, 30651, 30651]) Position ids shape: torch.Size([1, 30651]) Input IDs shape: torch.Size([1, 30651]) Labels shape: torch.Size([1, 30651]) Final batch size: 1, sequence length: 26840 Attention mask shape: torch.Size([1, 1, 26840, 26840]) Position ids shape: torch.Size([1, 26840]) Input IDs shape: torch.Size([1, 26840]) Labels shape: torch.Size([1, 26840]) Final batch size: 1, sequence length: 16304 Attention mask shape: torch.Size([1, 1, 16304, 16304]) Position ids shape: torch.Size([1, 16304]) Input IDs shape: torch.Size([1, 16304]) Labels shape: torch.Size([1, 16304]) Final batch size: 1, sequence length: 18859 Attention mask shape: torch.Size([1, 1, 18859, 18859]) Position ids shape: torch.Size([1, 18859]) Input IDs shape: torch.Size([1, 18859]) Labels shape: torch.Size([1, 18859]) Final batch size: 1, sequence length: 34785 Attention mask shape: torch.Size([1, 1, 34785, 34785]) Position ids shape: torch.Size([1, 34785]) Input IDs shape: torch.Size([1, 34785]) Labels shape: torch.Size([1, 34785]) Final batch size: 1, sequence length: 35914 Attention mask shape: torch.Size([1, 1, 35914, 35914]) Position ids shape: torch.Size([1, 35914]) Input IDs shape: torch.Size([1, 35914]) Labels shape: torch.Size([1, 35914]) Final batch size: 1, sequence length: 40088 Attention mask shape: torch.Size([1, 1, 40088, 40088]) Position ids shape: torch.Size([1, 40088]) Input IDs shape: torch.Size([1, 40088]) Labels shape: torch.Size([1, 40088]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 10198 Attention mask shape: torch.Size([1, 1, 10198, 10198]) Position ids shape: torch.Size([1, 10198]) Input IDs shape: torch.Size([1, 10198]) Labels shape: torch.Size([1, 10198]) Final batch size: 1, sequence length: 31143 Attention mask shape: torch.Size([1, 1, 31143, 31143]) Position ids shape: torch.Size([1, 31143]) Input IDs shape: torch.Size([1, 31143]) Labels shape: torch.Size([1, 31143]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 32559 Attention mask shape: torch.Size([1, 1, 32559, 32559]) Position ids shape: torch.Size([1, 32559]) Input IDs shape: torch.Size([1, 32559]) Labels shape: torch.Size([1, 32559]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 36638 Attention mask shape: torch.Size([1, 1, 36638, 36638]) Position ids shape: torch.Size([1, 36638]) Input IDs shape: torch.Size([1, 36638]) Labels shape: torch.Size([1, 36638]) Final batch size: 1, sequence length: 19494 Attention mask shape: torch.Size([1, 1, 19494, 19494]) Position ids shape: torch.Size([1, 19494]) Input IDs shape: torch.Size([1, 19494]) Labels shape: torch.Size([1, 19494]) Final batch size: 1, sequence length: 31316 Attention mask shape: torch.Size([1, 1, 31316, 31316]) Position ids shape: torch.Size([1, 31316]) Input IDs shape: torch.Size([1, 31316]) Labels shape: torch.Size([1, 31316]) Final batch size: 1, sequence length: 30835 Attention mask shape: torch.Size([1, 1, 30835, 30835]) Position ids shape: torch.Size([1, 30835]) Input IDs shape: torch.Size([1, 30835]) Labels shape: torch.Size([1, 30835]) Final batch size: 1, sequence length: 23936 Attention mask shape: torch.Size([1, 1, 23936, 23936]) Position ids shape: torch.Size([1, 23936]) Input IDs shape: torch.Size([1, 23936]) Labels shape: torch.Size([1, 23936]) Final batch size: 1, sequence length: 27819 Attention mask shape: torch.Size([1, 1, 27819, 27819]) Position ids shape: torch.Size([1, 27819]) Input IDs shape: torch.Size([1, 27819]) Labels shape: torch.Size([1, 27819]) Final batch size: 1, sequence length: 31754 Attention mask shape: torch.Size([1, 1, 31754, 31754]) Position ids shape: torch.Size([1, 31754]) Input IDs shape: torch.Size([1, 31754]) Labels shape: torch.Size([1, 31754]) Final batch size: 1, sequence length: 12605 Attention mask shape: torch.Size([1, 1, 12605, 12605]) Position ids shape: torch.Size([1, 12605]) Input IDs shape: torch.Size([1, 12605]) Labels shape: torch.Size([1, 12605]) Final batch size: 1, sequence length: 25573 Attention mask shape: torch.Size([1, 1, 25573, 25573]) Position ids shape: torch.Size([1, 25573]) Input IDs shape: torch.Size([1, 25573]) Labels shape: torch.Size([1, 25573]) Final batch size: 1, sequence length: 40763 Attention mask shape: torch.Size([1, 1, 40763, 40763]) Position ids shape: torch.Size([1, 40763]) Input IDs shape: torch.Size([1, 40763]) Labels shape: torch.Size([1, 40763]) Final batch size: 1, sequence length: 26269 Attention mask shape: torch.Size([1, 1, 26269, 26269]) Position ids shape: torch.Size([1, 26269]) Input IDs shape: torch.Size([1, 26269]) Labels shape: torch.Size([1, 26269]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 37170 Attention mask shape: torch.Size([1, 1, 37170, 37170]) Position ids shape: torch.Size([1, 37170]) Input IDs shape: torch.Size([1, 37170]) Labels shape: torch.Size([1, 37170]) {'loss': 0.2781, 'grad_norm': 0.3440715022778946, 'learning_rate': 5.2616797812147205e-06, 'num_tokens': -inf, 'epoch': 4.25} Final batch size: 1, sequence length: 7998 Attention mask shape: torch.Size([1, 1, 7998, 7998]) Position ids shape: torch.Size([1, 7998]) Input IDs shape: torch.Size([1, 7998]) Labels shape: torch.Size([1, 7998]) Final batch size: 1, sequence length: 6925 Attention mask shape: torch.Size([1, 1, 6925, 6925]) Position ids shape: torch.Size([1, 6925]) Input IDs shape: torch.Size([1, 6925]) Labels shape: torch.Size([1, 6925]) Final batch size: 1, sequence length: 7402 Attention mask shape: torch.Size([1, 1, 7402, 7402]) Position ids shape: torch.Size([1, 7402]) Input IDs shape: torch.Size([1, 7402]) Labels shape: torch.Size([1, 7402]) Final batch size: 1, sequence length: 10102 Attention mask shape: torch.Size([1, 1, 10102, 10102]) Position ids shape: torch.Size([1, 10102]) Input IDs shape: torch.Size([1, 10102]) Labels shape: torch.Size([1, 10102]) Final batch size: 1, sequence length: 9243 Attention mask shape: torch.Size([1, 1, 9243, 9243]) Position ids shape: torch.Size([1, 9243]) Input IDs shape: torch.Size([1, 9243]) Labels shape: torch.Size([1, 9243]) Final batch size: 1, sequence length: 9452 Attention mask shape: torch.Size([1, 1, 9452, 9452]) Position ids shape: torch.Size([1, 9452]) Input IDs shape: torch.Size([1, 9452]) Labels shape: torch.Size([1, 9452]) Final batch size: 1, sequence length: 10862 Attention mask shape: torch.Size([1, 1, 10862, 10862]) Position ids shape: torch.Size([1, 10862]) Input IDs shape: torch.Size([1, 10862]) Labels shape: torch.Size([1, 10862]) Final batch size: 1, sequence length: 11947 Attention mask shape: torch.Size([1, 1, 11947, 11947]) Position ids shape: torch.Size([1, 11947]) Input IDs shape: torch.Size([1, 11947]) Labels shape: torch.Size([1, 11947]) Final batch size: 1, sequence length: 10804 Attention mask shape: torch.Size([1, 1, 10804, 10804]) Position ids shape: torch.Size([1, 10804]) Input IDs shape: torch.Size([1, 10804]) Labels shape: torch.Size([1, 10804]) Final batch size: 1, sequence length: 12719 Attention mask shape: torch.Size([1, 1, 12719, 12719]) Position ids shape: torch.Size([1, 12719]) Input IDs shape: torch.Size([1, 12719]) Labels shape: torch.Size([1, 12719]) Final batch size: 1, sequence length: 8432 Attention mask shape: torch.Size([1, 1, 8432, 8432]) Position ids shape: torch.Size([1, 8432]) Input IDs shape: torch.Size([1, 8432]) Labels shape: torch.Size([1, 8432]) Final batch size: 1, sequence length: 11623 Attention mask shape: torch.Size([1, 1, 11623, 11623]) Position ids shape: torch.Size([1, 11623]) Input IDs shape: torch.Size([1, 11623]) Labels shape: torch.Size([1, 11623]) Final batch size: 1, sequence length: 13209 Attention mask shape: torch.Size([1, 1, 13209, 13209]) Position ids shape: torch.Size([1, 13209]) Input IDs shape: torch.Size([1, 13209]) Labels shape: torch.Size([1, 13209]) Final batch size: 1, sequence length: 15794 Attention mask shape: torch.Size([1, 1, 15794, 15794]) Position ids shape: torch.Size([1, 15794]) Input IDs shape: torch.Size([1, 15794]) Labels shape: torch.Size([1, 15794]) Final batch size: 1, sequence length: 16060 Attention mask shape: torch.Size([1, 1, 16060, 16060]) Position ids shape: torch.Size([1, 16060]) Input IDs shape: torch.Size([1, 16060]) Labels shape: torch.Size([1, 16060]) Final batch size: 1, sequence length: 15053 Attention mask shape: torch.Size([1, 1, 15053, 15053]) Position ids shape: torch.Size([1, 15053]) Input IDs shape: torch.Size([1, 15053]) Labels shape: torch.Size([1, 15053]) Final batch size: 1, sequence length: 15886 Attention mask shape: torch.Size([1, 1, 15886, 15886]) Position ids shape: torch.Size([1, 15886]) Input IDs shape: torch.Size([1, 15886]) Labels shape: torch.Size([1, 15886]) Final batch size: 1, sequence length: 17376 Attention mask shape: torch.Size([1, 1, 17376, 17376]) Position ids shape: torch.Size([1, 17376]) Input IDs shape: torch.Size([1, 17376]) Labels shape: torch.Size([1, 17376]) Final batch size: 1, sequence length: 17918 Attention mask shape: torch.Size([1, 1, 17918, 17918]) Position ids shape: torch.Size([1, 17918]) Input IDs shape: torch.Size([1, 17918]) Labels shape: torch.Size([1, 17918]) Final batch size: 1, sequence length: 17370 Attention mask shape: torch.Size([1, 1, 17370, 17370]) Position ids shape: torch.Size([1, 17370]) Input IDs shape: torch.Size([1, 17370]) Labels shape: torch.Size([1, 17370]) Final batch size: 1, sequence length: 19609 Attention mask shape: torch.Size([1, 1, 19609, 19609]) Position ids shape: torch.Size([1, 19609]) Input IDs shape: torch.Size([1, 19609]) Labels shape: torch.Size([1, 19609]) Final batch size: 1, sequence length: 17767 Attention mask shape: torch.Size([1, 1, 17767, 17767]) Position ids shape: torch.Size([1, 17767]) Input IDs shape: torch.Size([1, 17767]) Labels shape: torch.Size([1, 17767]) Final batch size: 1, sequence length: 19935 Attention mask shape: torch.Size([1, 1, 19935, 19935]) Position ids shape: torch.Size([1, 19935]) Input IDs shape: torch.Size([1, 19935]) Labels shape: torch.Size([1, 19935]) Final batch size: 1, sequence length: 20185 Attention mask shape: torch.Size([1, 1, 20185, 20185]) Position ids shape: torch.Size([1, 20185]) Input IDs shape: torch.Size([1, 20185]) Labels shape: torch.Size([1, 20185]) Final batch size: 1, sequence length: 20056 Attention mask shape: torch.Size([1, 1, 20056, 20056]) Position ids shape: torch.Size([1, 20056]) Input IDs shape: torch.Size([1, 20056]) Labels shape: torch.Size([1, 20056]) Final batch size: 1, sequence length: 16536 Attention mask shape: torch.Size([1, 1, 16536, 16536]) Position ids shape: torch.Size([1, 16536]) Input IDs shape: torch.Size([1, 16536]) Labels shape: torch.Size([1, 16536]) Final batch size: 1, sequence length: 17625 Attention mask shape: torch.Size([1, 1, 17625, 17625]) Position ids shape: torch.Size([1, 17625]) Input IDs shape: torch.Size([1, 17625]) Labels shape: torch.Size([1, 17625]) Final batch size: 1, sequence length: 19629 Attention mask shape: torch.Size([1, 1, 19629, 19629]) Position ids shape: torch.Size([1, 19629]) Input IDs shape: torch.Size([1, 19629]) Labels shape: torch.Size([1, 19629]) Final batch size: 1, sequence length: 21117 Attention mask shape: torch.Size([1, 1, 21117, 21117]) Position ids shape: torch.Size([1, 21117]) Input IDs shape: torch.Size([1, 21117]) Labels shape: torch.Size([1, 21117]) Final batch size: 1, sequence length: 21771 Attention mask shape: torch.Size([1, 1, 21771, 21771]) Position ids shape: torch.Size([1, 21771]) Input IDs shape: torch.Size([1, 21771]) Labels shape: torch.Size([1, 21771]) Final batch size: 1, sequence length: 17665 Attention mask shape: torch.Size([1, 1, 17665, 17665]) Position ids shape: torch.Size([1, 17665]) Input IDs shape: torch.Size([1, 17665]) Labels shape: torch.Size([1, 17665]) Final batch size: 1, sequence length: 21421 Attention mask shape: torch.Size([1, 1, 21421, 21421]) Position ids shape: torch.Size([1, 21421]) Input IDs shape: torch.Size([1, 21421]) Labels shape: torch.Size([1, 21421]) Final batch size: 1, sequence length: 8498 Attention mask shape: torch.Size([1, 1, 8498, 8498]) Position ids shape: torch.Size([1, 8498]) Input IDs shape: torch.Size([1, 8498]) Labels shape: torch.Size([1, 8498]) Final batch size: 1, sequence length: 22797 Attention mask shape: torch.Size([1, 1, 22797, 22797]) Position ids shape: torch.Size([1, 22797]) Input IDs shape: torch.Size([1, 22797]) Labels shape: torch.Size([1, 22797]) Final batch size: 1, sequence length: 21980 Attention mask shape: torch.Size([1, 1, 21980, 21980]) Position ids shape: torch.Size([1, 21980]) Input IDs shape: torch.Size([1, 21980]) Labels shape: torch.Size([1, 21980]) Final batch size: 1, sequence length: 20554 Attention mask shape: torch.Size([1, 1, 20554, 20554]) Position ids shape: torch.Size([1, 20554]) Input IDs shape: torch.Size([1, 20554]) Labels shape: torch.Size([1, 20554]) Final batch size: 1, sequence length: 12006 Attention mask shape: torch.Size([1, 1, 12006, 12006]) Position ids shape: torch.Size([1, 12006]) Input IDs shape: torch.Size([1, 12006]) Labels shape: torch.Size([1, 12006]) Final batch size: 1, sequence length: 20524 Attention mask shape: torch.Size([1, 1, 20524, 20524]) Position ids shape: torch.Size([1, 20524]) Input IDs shape: torch.Size([1, 20524]) Labels shape: torch.Size([1, 20524]) Final batch size: 1, sequence length: 16141 Attention mask shape: torch.Size([1, 1, 16141, 16141]) Position ids shape: torch.Size([1, 16141]) Input IDs shape: torch.Size([1, 16141]) Labels shape: torch.Size([1, 16141]) Final batch size: 1, sequence length: 24308 Attention mask shape: torch.Size([1, 1, 24308, 24308]) Position ids shape: torch.Size([1, 24308]) Input IDs shape: torch.Size([1, 24308]) Labels shape: torch.Size([1, 24308]) Final batch size: 1, sequence length: 18991 Attention mask shape: torch.Size([1, 1, 18991, 18991]) Position ids shape: torch.Size([1, 18991]) Input IDs shape: torch.Size([1, 18991]) Labels shape: torch.Size([1, 18991]) Final batch size: 1, sequence length: 22208 Attention mask shape: torch.Size([1, 1, 22208, 22208]) Position ids shape: torch.Size([1, 22208]) Input IDs shape: torch.Size([1, 22208]) Labels shape: torch.Size([1, 22208]) Final batch size: 1, sequence length: 27179 Attention mask shape: torch.Size([1, 1, 27179, 27179]) Position ids shape: torch.Size([1, 27179]) Input IDs shape: torch.Size([1, 27179]) Labels shape: torch.Size([1, 27179]) Final batch size: 1, sequence length: 20816 Attention mask shape: torch.Size([1, 1, 20816, 20816]) Position ids shape: torch.Size([1, 20816]) Input IDs shape: torch.Size([1, 20816]) Labels shape: torch.Size([1, 20816]) Final batch size: 1, sequence length: 26142 Attention mask shape: torch.Size([1, 1, 26142, 26142]) Position ids shape: torch.Size([1, 26142]) Input IDs shape: torch.Size([1, 26142]) Labels shape: torch.Size([1, 26142]) Final batch size: 1, sequence length: 24515 Attention mask shape: torch.Size([1, 1, 24515, 24515]) Position ids shape: torch.Size([1, 24515]) Input IDs shape: torch.Size([1, 24515]) Labels shape: torch.Size([1, 24515]) Final batch size: 1, sequence length: 16439 Attention mask shape: torch.Size([1, 1, 16439, 16439]) Position ids shape: torch.Size([1, 16439]) Input IDs shape: torch.Size([1, 16439]) Labels shape: torch.Size([1, 16439]) Final batch size: 1, sequence length: 24433 Attention mask shape: torch.Size([1, 1, 24433, 24433]) Position ids shape: torch.Size([1, 24433]) Input IDs shape: torch.Size([1, 24433]) Labels shape: torch.Size([1, 24433]) Final batch size: 1, sequence length: 26449 Attention mask shape: torch.Size([1, 1, 26449, 26449]) Position ids shape: torch.Size([1, 26449]) Input IDs shape: torch.Size([1, 26449]) Labels shape: torch.Size([1, 26449]) Final batch size: 1, sequence length: 18122 Attention mask shape: torch.Size([1, 1, 18122, 18122]) Position ids shape: torch.Size([1, 18122]) Input IDs shape: torch.Size([1, 18122]) Labels shape: torch.Size([1, 18122]) Final batch size: 1, sequence length: 21152 Attention mask shape: torch.Size([1, 1, 21152, 21152]) Position ids shape: torch.Size([1, 21152]) Input IDs shape: torch.Size([1, 21152]) Labels shape: torch.Size([1, 21152]) Final batch size: 1, sequence length: 28164 Final batch size: 1, sequence length: 16564 Attention mask shape: torch.Size([1, 1, 28164, 28164]) Position ids shape: torch.Size([1, 28164]) Attention mask shape: torch.Size([1, 1, 16564, 16564]) Input IDs shape: torch.Size([1, 28164]) Position ids shape: torch.Size([1, 16564]) Labels shape: torch.Size([1, 28164]) Input IDs shape: torch.Size([1, 16564]) Labels shape: torch.Size([1, 16564]) Final batch size: 1, sequence length: 22683 Attention mask shape: torch.Size([1, 1, 22683, 22683]) Position ids shape: torch.Size([1, 22683]) Input IDs shape: torch.Size([1, 22683]) Labels shape: torch.Size([1, 22683]) Final batch size: 1, sequence length: 29628 Attention mask shape: torch.Size([1, 1, 29628, 29628]) Position ids shape: torch.Size([1, 29628]) Input IDs shape: torch.Size([1, 29628]) Labels shape: torch.Size([1, 29628]) Final batch size: 1, sequence length: 17363 Attention mask shape: torch.Size([1, 1, 17363, 17363]) Position ids shape: torch.Size([1, 17363]) Input IDs shape: torch.Size([1, 17363]) Labels shape: torch.Size([1, 17363]) Final batch size: 1, sequence length: 31339 Attention mask shape: torch.Size([1, 1, 31339, 31339]) Position ids shape: torch.Size([1, 31339]) Input IDs shape: torch.Size([1, 31339]) Labels shape: torch.Size([1, 31339]) Final batch size: 1, sequence length: 24802 Attention mask shape: torch.Size([1, 1, 24802, 24802]) Position ids shape: torch.Size([1, 24802]) Input IDs shape: torch.Size([1, 24802]) Labels shape: torch.Size([1, 24802]) Final batch size: 1, sequence length: 31507 Attention mask shape: torch.Size([1, 1, 31507, 31507]) Position ids shape: torch.Size([1, 31507]) Input IDs shape: torch.Size([1, 31507]) Labels shape: torch.Size([1, 31507]) Final batch size: 1, sequence length: 16837 Attention mask shape: torch.Size([1, 1, 16837, 16837]) Position ids shape: torch.Size([1, 16837]) Input IDs shape: torch.Size([1, 16837]) Labels shape: torch.Size([1, 16837]) Final batch size: 1, sequence length: 32752 Attention mask shape: torch.Size([1, 1, 32752, 32752]) Position ids shape: torch.Size([1, 32752]) Input IDs shape: torch.Size([1, 32752]) Labels shape: torch.Size([1, 32752]) Final batch size: 1, sequence length: 26121 Attention mask shape: torch.Size([1, 1, 26121, 26121]) Position ids shape: torch.Size([1, 26121]) Input IDs shape: torch.Size([1, 26121]) Labels shape: torch.Size([1, 26121]) Final batch size: 1, sequence length: 24527 Attention mask shape: torch.Size([1, 1, 24527, 24527]) Position ids shape: torch.Size([1, 24527]) Input IDs shape: torch.Size([1, 24527]) Labels shape: torch.Size([1, 24527]) Final batch size: 1, sequence length: 29392 Attention mask shape: torch.Size([1, 1, 29392, 29392]) Position ids shape: torch.Size([1, 29392]) Input IDs shape: torch.Size([1, 29392]) Labels shape: torch.Size([1, 29392]) Final batch size: 1, sequence length: 30181 Attention mask shape: torch.Size([1, 1, 30181, 30181]) Position ids shape: torch.Size([1, 30181]) Input IDs shape: torch.Size([1, 30181]) Labels shape: torch.Size([1, 30181]) Final batch size: 1, sequence length: 32466 Attention mask shape: torch.Size([1, 1, 32466, 32466]) Position ids shape: torch.Size([1, 32466]) Input IDs shape: torch.Size([1, 32466]) Labels shape: torch.Size([1, 32466]) Final batch size: 1, sequence length: 34711 Attention mask shape: torch.Size([1, 1, 34711, 34711]) Position ids shape: torch.Size([1, 34711]) Input IDs shape: torch.Size([1, 34711]) Labels shape: torch.Size([1, 34711]) Final batch size: 1, sequence length: 30197 Attention mask shape: torch.Size([1, 1, 30197, 30197]) Position ids shape: torch.Size([1, 30197]) Input IDs shape: torch.Size([1, 30197]) Labels shape: torch.Size([1, 30197]) Final batch size: 1, sequence length: 32891 Attention mask shape: torch.Size([1, 1, 32891, 32891]) Position ids shape: torch.Size([1, 32891]) Input IDs shape: torch.Size([1, 32891]) Labels shape: torch.Size([1, 32891]) Final batch size: 1, sequence length: 10857 Attention mask shape: torch.Size([1, 1, 10857, 10857]) Position ids shape: torch.Size([1, 10857]) Input IDs shape: torch.Size([1, 10857]) Labels shape: torch.Size([1, 10857]) Final batch size: 1, sequence length: 11107 Attention mask shape: torch.Size([1, 1, 11107, 11107]) Position ids shape: torch.Size([1, 11107]) Input IDs shape: torch.Size([1, 11107]) Labels shape: torch.Size([1, 11107]) Final batch size: 1, sequence length: 20274 Attention mask shape: torch.Size([1, 1, 20274, 20274]) Position ids shape: torch.Size([1, 20274]) Input IDs shape: torch.Size([1, 20274]) Labels shape: torch.Size([1, 20274]) Final batch size: 1, sequence length: 33894 Attention mask shape: torch.Size([1, 1, 33894, 33894]) Position ids shape: torch.Size([1, 33894]) Input IDs shape: torch.Size([1, 33894]) Labels shape: torch.Size([1, 33894]) Final batch size: 1, sequence length: 22762 Attention mask shape: torch.Size([1, 1, 22762, 22762]) Position ids shape: torch.Size([1, 22762]) Input IDs shape: torch.Size([1, 22762]) Labels shape: torch.Size([1, 22762]) Final batch size: 1, sequence length: 33087 Attention mask shape: torch.Size([1, 1, 33087, 33087]) Position ids shape: torch.Size([1, 33087]) Input IDs shape: torch.Size([1, 33087]) Labels shape: torch.Size([1, 33087]) Final batch size: 1, sequence length: 21143 Attention mask shape: torch.Size([1, 1, 21143, 21143]) Position ids shape: torch.Size([1, 21143]) Input IDs shape: torch.Size([1, 21143]) Labels shape: torch.Size([1, 21143]) Final batch size: 1, sequence length: 34405 Attention mask shape: torch.Size([1, 1, 34405, 34405]) Position ids shape: torch.Size([1, 34405]) Input IDs shape: torch.Size([1, 34405]) Labels shape: torch.Size([1, 34405]) Final batch size: 1, sequence length: 37555 Attention mask shape: torch.Size([1, 1, 37555, 37555]) Position ids shape: torch.Size([1, 37555]) Input IDs shape: torch.Size([1, 37555]) Labels shape: torch.Size([1, 37555]) Final batch size: 1, sequence length: 31525 Attention mask shape: torch.Size([1, 1, 31525, 31525]) Position ids shape: torch.Size([1, 31525]) Input IDs shape: torch.Size([1, 31525]) Labels shape: torch.Size([1, 31525]) Final batch size: 1, sequence length: 34512 Attention mask shape: torch.Size([1, 1, 34512, 34512]) Position ids shape: torch.Size([1, 34512]) Input IDs shape: torch.Size([1, 34512]) Labels shape: torch.Size([1, 34512]) Final batch size: 1, sequence length: 31875 Attention mask shape: torch.Size([1, 1, 31875, 31875]) Position ids shape: torch.Size([1, 31875]) Input IDs shape: torch.Size([1, 31875]) Labels shape: torch.Size([1, 31875]) Final batch size: 1, sequence length: 13942 Attention mask shape: torch.Size([1, 1, 13942, 13942]) Position ids shape: torch.Size([1, 13942]) Input IDs shape: torch.Size([1, 13942]) Labels shape: torch.Size([1, 13942]) Final batch size: 1, sequence length: 33794 Attention mask shape: torch.Size([1, 1, 33794, 33794]) Position ids shape: torch.Size([1, 33794]) Input IDs shape: torch.Size([1, 33794]) Labels shape: torch.Size([1, 33794]) Final batch size: 1, sequence length: 37343 Attention mask shape: torch.Size([1, 1, 37343, 37343]) Position ids shape: torch.Size([1, 37343]) Input IDs shape: torch.Size([1, 37343]) Labels shape: torch.Size([1, 37343]) Final batch size: 1, sequence length: 27061 Attention mask shape: torch.Size([1, 1, 27061, 27061]) Position ids shape: torch.Size([1, 27061]) Input IDs shape: torch.Size([1, 27061]) Labels shape: torch.Size([1, 27061]) Final batch size: 1, sequence length: 31561 Attention mask shape: torch.Size([1, 1, 31561, 31561]) Position ids shape: torch.Size([1, 31561]) Input IDs shape: torch.Size([1, 31561]) Labels shape: torch.Size([1, 31561]) Final batch size: 1, sequence length: 31518 Attention mask shape: torch.Size([1, 1, 31518, 31518]) Position ids shape: torch.Size([1, 31518]) Input IDs shape: torch.Size([1, 31518]) Labels shape: torch.Size([1, 31518]) Final batch size: 1, sequence length: 38010 Attention mask shape: torch.Size([1, 1, 38010, 38010]) Position ids shape: torch.Size([1, 38010]) Input IDs shape: torch.Size([1, 38010]) Labels shape: torch.Size([1, 38010]) Final batch size: 1, sequence length: 17181 Attention mask shape: torch.Size([1, 1, 17181, 17181]) Position ids shape: torch.Size([1, 17181]) Input IDs shape: torch.Size([1, 17181]) Labels shape: torch.Size([1, 17181]) Final batch size: 1, sequence length: 18229 Attention mask shape: torch.Size([1, 1, 18229, 18229]) Position ids shape: torch.Size([1, 18229]) Input IDs shape: torch.Size([1, 18229]) Labels shape: torch.Size([1, 18229]) Final batch size: 1, sequence length: 35523 Attention mask shape: torch.Size([1, 1, 35523, 35523]) Position ids shape: torch.Size([1, 35523]) Input IDs shape: torch.Size([1, 35523]) Labels shape: torch.Size([1, 35523]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 36450 Attention mask shape: torch.Size([1, 1, 36450, 36450]) Position ids shape: torch.Size([1, 36450]) Input IDs shape: torch.Size([1, 36450]) Labels shape: torch.Size([1, 36450]) Final batch size: 1, sequence length: 29216 Attention mask shape: torch.Size([1, 1, 29216, 29216]) Position ids shape: torch.Size([1, 29216]) Input IDs shape: torch.Size([1, 29216]) Labels shape: torch.Size([1, 29216]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 13978 Attention mask shape: torch.Size([1, 1, 13978, 13978]) Position ids shape: torch.Size([1, 13978]) Input IDs shape: torch.Size([1, 13978]) Labels shape: torch.Size([1, 13978]) Final batch size: 1, sequence length: 33367 Attention mask shape: torch.Size([1, 1, 33367, 33367]) Position ids shape: torch.Size([1, 33367]) Input IDs shape: torch.Size([1, 33367]) Labels shape: torch.Size([1, 33367]) Final batch size: 1, sequence length: 16716 Attention mask shape: torch.Size([1, 1, 16716, 16716]) Position ids shape: torch.Size([1, 16716]) Input IDs shape: torch.Size([1, 16716]) Labels shape: torch.Size([1, 16716]) Final batch size: 1, sequence length: 40910 Attention mask shape: torch.Size([1, 1, 40910, 40910]) Position ids shape: torch.Size([1, 40910]) Input IDs shape: torch.Size([1, 40910]) Labels shape: torch.Size([1, 40910]) Final batch size: 1, sequence length: 36464 Attention mask shape: torch.Size([1, 1, 36464, 36464]) Position ids shape: torch.Size([1, 36464]) Input IDs shape: torch.Size([1, 36464]) Labels shape: torch.Size([1, 36464]) Final batch size: 1, sequence length: 21859 Attention mask shape: torch.Size([1, 1, 21859, 21859]) Position ids shape: torch.Size([1, 21859]) Input IDs shape: torch.Size([1, 21859]) Labels shape: torch.Size([1, 21859]) Final batch size: 1, sequence length: 16433 Attention mask shape: torch.Size([1, 1, 16433, 16433]) Position ids shape: torch.Size([1, 16433]) Input IDs shape: torch.Size([1, 16433]) Labels shape: torch.Size([1, 16433]) Final batch size: 1, sequence length: 19287 Attention mask shape: torch.Size([1, 1, 19287, 19287]) Position ids shape: torch.Size([1, 19287]) Input IDs shape: torch.Size([1, 19287]) Labels shape: torch.Size([1, 19287]) Final batch size: 1, sequence length: 29042 Attention mask shape: torch.Size([1, 1, 29042, 29042]) Position ids shape: torch.Size([1, 29042]) Input IDs shape: torch.Size([1, 29042]) Labels shape: torch.Size([1, 29042]) Final batch size: 1, sequence length: 30134 Attention mask shape: torch.Size([1, 1, 30134, 30134]) Position ids shape: torch.Size([1, 30134]) Input IDs shape: torch.Size([1, 30134]) Labels shape: torch.Size([1, 30134]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 37349 Attention mask shape: torch.Size([1, 1, 37349, 37349]) Position ids shape: torch.Size([1, 37349]) Input IDs shape: torch.Size([1, 37349]) Labels shape: torch.Size([1, 37349]) Final batch size: 1, sequence length: 31449 Attention mask shape: torch.Size([1, 1, 31449, 31449]) Position ids shape: torch.Size([1, 31449]) Input IDs shape: torch.Size([1, 31449]) Labels shape: torch.Size([1, 31449]) Final batch size: 1, sequence length: 40752 Attention mask shape: torch.Size([1, 1, 40752, 40752]) Position ids shape: torch.Size([1, 40752]) Input IDs shape: torch.Size([1, 40752]) Labels shape: torch.Size([1, 40752]) Final batch size: 1, sequence length: 17758 Attention mask shape: torch.Size([1, 1, 17758, 17758]) Position ids shape: torch.Size([1, 17758]) Input IDs shape: torch.Size([1, 17758]) Labels shape: torch.Size([1, 17758]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 39169 Attention mask shape: torch.Size([1, 1, 39169, 39169]) Position ids shape: torch.Size([1, 39169]) Input IDs shape: torch.Size([1, 39169]) Labels shape: torch.Size([1, 39169]) Final batch size: 1, sequence length: 26893 Attention mask shape: torch.Size([1, 1, 26893, 26893]) Position ids shape: torch.Size([1, 26893]) Input IDs shape: torch.Size([1, 26893]) Labels shape: torch.Size([1, 26893]) Final batch size: 1, sequence length: 38529 Attention mask shape: torch.Size([1, 1, 38529, 38529]) Position ids shape: torch.Size([1, 38529]) Input IDs shape: torch.Size([1, 38529]) Labels shape: torch.Size([1, 38529]) Final batch size: 1, sequence length: 26922 Attention mask shape: torch.Size([1, 1, 26922, 26922]) Position ids shape: torch.Size([1, 26922]) Input IDs shape: torch.Size([1, 26922]) Labels shape: torch.Size([1, 26922]) Final batch size: 1, sequence length: 17535 Attention mask shape: torch.Size([1, 1, 17535, 17535]) Position ids shape: torch.Size([1, 17535]) Input IDs shape: torch.Size([1, 17535]) Labels shape: torch.Size([1, 17535]) Final batch size: 1, sequence length: 39258 Attention mask shape: torch.Size([1, 1, 39258, 39258]) Position ids shape: torch.Size([1, 39258]) Input IDs shape: torch.Size([1, 39258]) Labels shape: torch.Size([1, 39258]) Final batch size: 1, sequence length: 17890 Attention mask shape: torch.Size([1, 1, 17890, 17890]) Position ids shape: torch.Size([1, 17890]) Input IDs shape: torch.Size([1, 17890]) Labels shape: torch.Size([1, 17890]) Final batch size: 1, sequence length: 32638 Attention mask shape: torch.Size([1, 1, 32638, 32638]) Position ids shape: torch.Size([1, 32638]) Input IDs shape: torch.Size([1, 32638]) Labels shape: torch.Size([1, 32638]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 38891 Attention mask shape: torch.Size([1, 1, 38891, 38891]) Position ids shape: torch.Size([1, 38891]) Input IDs shape: torch.Size([1, 38891]) Labels shape: torch.Size([1, 38891]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 18471 Attention mask shape: torch.Size([1, 1, 18471, 18471]) Position ids shape: torch.Size([1, 18471]) Input IDs shape: torch.Size([1, 18471]) Labels shape: torch.Size([1, 18471]) Final batch size: 1, sequence length: 32465 Attention mask shape: torch.Size([1, 1, 32465, 32465]) Position ids shape: torch.Size([1, 32465]) Input IDs shape: torch.Size([1, 32465]) Labels shape: torch.Size([1, 32465]) Final batch size: 1, sequence length: 36599 Attention mask shape: torch.Size([1, 1, 36599, 36599]) Position ids shape: torch.Size([1, 36599]) Input IDs shape: torch.Size([1, 36599]) Labels shape: torch.Size([1, 36599]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 39953 Attention mask shape: torch.Size([1, 1, 39953, 39953]) Position ids shape: torch.Size([1, 39953]) Input IDs shape: torch.Size([1, 39953]) Labels shape: torch.Size([1, 39953]) {'loss': 0.2664, 'grad_norm': 0.37374961648456445, 'learning_rate': 5e-06, 'num_tokens': -inf, 'epoch': 4.38} Final batch size: 1, sequence length: 5818 Attention mask shape: torch.Size([1, 1, 5818, 5818]) Position ids shape: torch.Size([1, 5818]) Input IDs shape: torch.Size([1, 5818]) Labels shape: torch.Size([1, 5818]) Final batch size: 1, sequence length: 7977 Attention mask shape: torch.Size([1, 1, 7977, 7977]) Position ids shape: torch.Size([1, 7977]) Input IDs shape: torch.Size([1, 7977]) Labels shape: torch.Size([1, 7977]) Final batch size: 1, sequence length: 10107 Attention mask shape: torch.Size([1, 1, 10107, 10107]) Position ids shape: torch.Size([1, 10107]) Input IDs shape: torch.Size([1, 10107]) Labels shape: torch.Size([1, 10107]) Final batch size: 1, sequence length: 8500 Attention mask shape: torch.Size([1, 1, 8500, 8500]) Position ids shape: torch.Size([1, 8500]) Input IDs shape: torch.Size([1, 8500]) Labels shape: torch.Size([1, 8500]) Final batch size: 1, sequence length: 10080 Attention mask shape: torch.Size([1, 1, 10080, 10080]) Position ids shape: torch.Size([1, 10080]) Input IDs shape: torch.Size([1, 10080]) Labels shape: torch.Size([1, 10080]) Final batch size: 1, sequence length: 6215 Attention mask shape: torch.Size([1, 1, 6215, 6215]) Position ids shape: torch.Size([1, 6215]) Input IDs shape: torch.Size([1, 6215]) Labels shape: torch.Size([1, 6215]) Final batch size: 1, sequence length: 12454 Attention mask shape: torch.Size([1, 1, 12454, 12454]) Position ids shape: torch.Size([1, 12454]) Input IDs shape: torch.Size([1, 12454]) Labels shape: torch.Size([1, 12454]) Final batch size: 1, sequence length: 12826 Attention mask shape: torch.Size([1, 1, 12826, 12826]) Position ids shape: torch.Size([1, 12826]) Input IDs shape: torch.Size([1, 12826]) Labels shape: torch.Size([1, 12826]) Final batch size: 1, sequence length: 12928 Attention mask shape: torch.Size([1, 1, 12928, 12928]) Position ids shape: torch.Size([1, 12928]) Input IDs shape: torch.Size([1, 12928]) Labels shape: torch.Size([1, 12928]) Final batch size: 1, sequence length: 10505 Attention mask shape: torch.Size([1, 1, 10505, 10505]) Position ids shape: torch.Size([1, 10505]) Input IDs shape: torch.Size([1, 10505]) Labels shape: torch.Size([1, 10505]) Final batch size: 1, sequence length: 13624 Attention mask shape: torch.Size([1, 1, 13624, 13624]) Position ids shape: torch.Size([1, 13624]) Input IDs shape: torch.Size([1, 13624]) Labels shape: torch.Size([1, 13624]) Final batch size: 1, sequence length: 12556 Attention mask shape: torch.Size([1, 1, 12556, 12556]) Position ids shape: torch.Size([1, 12556]) Input IDs shape: torch.Size([1, 12556]) Labels shape: torch.Size([1, 12556]) Final batch size: 1, sequence length: 6095 Attention mask shape: torch.Size([1, 1, 6095, 6095]) Position ids shape: torch.Size([1, 6095]) Input IDs shape: torch.Size([1, 6095]) Labels shape: torch.Size([1, 6095]) Final batch size: 1, sequence length: 10687 Final batch size: 1, sequence length: 13459 Attention mask shape: torch.Size([1, 1, 10687, 10687]) Position ids shape: torch.Size([1, 10687]) Input IDs shape: torch.Size([1, 10687]) Labels shape: torch.Size([1, 10687]) Attention mask shape: torch.Size([1, 1, 13459, 13459]) Position ids shape: torch.Size([1, 13459]) Input IDs shape: torch.Size([1, 13459]) Labels shape: torch.Size([1, 13459]) Final batch size: 1, sequence length: 9379 Attention mask shape: torch.Size([1, 1, 9379, 9379]) Position ids shape: torch.Size([1, 9379]) Input IDs shape: torch.Size([1, 9379]) Labels shape: torch.Size([1, 9379]) Final batch size: 1, sequence length: 9217 Attention mask shape: torch.Size([1, 1, 9217, 9217]) Position ids shape: torch.Size([1, 9217]) Input IDs shape: torch.Size([1, 9217]) Labels shape: torch.Size([1, 9217]) Final batch size: 1, sequence length: 13092 Attention mask shape: torch.Size([1, 1, 13092, 13092]) Position ids shape: torch.Size([1, 13092]) Input IDs shape: torch.Size([1, 13092]) Labels shape: torch.Size([1, 13092]) Final batch size: 1, sequence length: 15185 Attention mask shape: torch.Size([1, 1, 15185, 15185]) Position ids shape: torch.Size([1, 15185]) Input IDs shape: torch.Size([1, 15185]) Labels shape: torch.Size([1, 15185]) Final batch size: 1, sequence length: 10390 Attention mask shape: torch.Size([1, 1, 10390, 10390]) Position ids shape: torch.Size([1, 10390]) Input IDs shape: torch.Size([1, 10390]) Labels shape: torch.Size([1, 10390]) Final batch size: 1, sequence length: 16782 Attention mask shape: torch.Size([1, 1, 16782, 16782]) Position ids shape: torch.Size([1, 16782]) Input IDs shape: torch.Size([1, 16782]) Labels shape: torch.Size([1, 16782]) Final batch size: 1, sequence length: 17980 Attention mask shape: torch.Size([1, 1, 17980, 17980]) Position ids shape: torch.Size([1, 17980]) Input IDs shape: torch.Size([1, 17980]) Labels shape: torch.Size([1, 17980]) Final batch size: 1, sequence length: 10469 Attention mask shape: torch.Size([1, 1, 10469, 10469]) Position ids shape: torch.Size([1, 10469]) Input IDs shape: torch.Size([1, 10469]) Labels shape: torch.Size([1, 10469]) Final batch size: 1, sequence length: 18454 Attention mask shape: torch.Size([1, 1, 18454, 18454]) Position ids shape: torch.Size([1, 18454]) Input IDs shape: torch.Size([1, 18454]) Labels shape: torch.Size([1, 18454]) Final batch size: 1, sequence length: 18469 Attention mask shape: torch.Size([1, 1, 18469, 18469]) Position ids shape: torch.Size([1, 18469]) Input IDs shape: torch.Size([1, 18469]) Labels shape: torch.Size([1, 18469]) Final batch size: 1, sequence length: 17417 Attention mask shape: torch.Size([1, 1, 17417, 17417]) Position ids shape: torch.Size([1, 17417]) Input IDs shape: torch.Size([1, 17417]) Labels shape: torch.Size([1, 17417]) Final batch size: 1, sequence length: 19683 Attention mask shape: torch.Size([1, 1, 19683, 19683]) Position ids shape: torch.Size([1, 19683]) Input IDs shape: torch.Size([1, 19683]) Labels shape: torch.Size([1, 19683]) Final batch size: 1, sequence length: 20243 Attention mask shape: torch.Size([1, 1, 20243, 20243]) Position ids shape: torch.Size([1, 20243]) Input IDs shape: torch.Size([1, 20243]) Labels shape: torch.Size([1, 20243]) Final batch size: 1, sequence length: 15263 Attention mask shape: torch.Size([1, 1, 15263, 15263]) Position ids shape: torch.Size([1, 15263]) Input IDs shape: torch.Size([1, 15263]) Labels shape: torch.Size([1, 15263]) Final batch size: 1, sequence length: 10277 Attention mask shape: torch.Size([1, 1, 10277, 10277]) Position ids shape: torch.Size([1, 10277]) Input IDs shape: torch.Size([1, 10277]) Labels shape: torch.Size([1, 10277]) Final batch size: 1, sequence length: 20191 Attention mask shape: torch.Size([1, 1, 20191, 20191]) Position ids shape: torch.Size([1, 20191]) Input IDs shape: torch.Size([1, 20191]) Labels shape: torch.Size([1, 20191]) Final batch size: 1, sequence length: 19671 Attention mask shape: torch.Size([1, 1, 19671, 19671]) Position ids shape: torch.Size([1, 19671]) Input IDs shape: torch.Size([1, 19671]) Labels shape: torch.Size([1, 19671]) Final batch size: 1, sequence length: 21051 Attention mask shape: torch.Size([1, 1, 21051, 21051]) Position ids shape: torch.Size([1, 21051]) Input IDs shape: torch.Size([1, 21051]) Labels shape: torch.Size([1, 21051]) Final batch size: 1, sequence length: 16585 Attention mask shape: torch.Size([1, 1, 16585, 16585]) Position ids shape: torch.Size([1, 16585]) Input IDs shape: torch.Size([1, 16585]) Labels shape: torch.Size([1, 16585]) Final batch size: 1, sequence length: 20432 Attention mask shape: torch.Size([1, 1, 20432, 20432]) Position ids shape: torch.Size([1, 20432]) Input IDs shape: torch.Size([1, 20432]) Labels shape: torch.Size([1, 20432]) Final batch size: 1, sequence length: 23238 Attention mask shape: torch.Size([1, 1, 23238, 23238]) Position ids shape: torch.Size([1, 23238]) Input IDs shape: torch.Size([1, 23238]) Labels shape: torch.Size([1, 23238]) Final batch size: 1, sequence length: 14892 Attention mask shape: torch.Size([1, 1, 14892, 14892]) Position ids shape: torch.Size([1, 14892]) Input IDs shape: torch.Size([1, 14892]) Labels shape: torch.Size([1, 14892]) Final batch size: 1, sequence length: 22133 Attention mask shape: torch.Size([1, 1, 22133, 22133]) Position ids shape: torch.Size([1, 22133]) Input IDs shape: torch.Size([1, 22133]) Labels shape: torch.Size([1, 22133]) Final batch size: 1, sequence length: 12012 Attention mask shape: torch.Size([1, 1, 12012, 12012]) Position ids shape: torch.Size([1, 12012]) Input IDs shape: torch.Size([1, 12012]) Labels shape: torch.Size([1, 12012]) Final batch size: 1, sequence length: 24255 Attention mask shape: torch.Size([1, 1, 24255, 24255]) Position ids shape: torch.Size([1, 24255]) Input IDs shape: torch.Size([1, 24255]) Labels shape: torch.Size([1, 24255]) Final batch size: 1, sequence length: 20562 Attention mask shape: torch.Size([1, 1, 20562, 20562]) Position ids shape: torch.Size([1, 20562]) Input IDs shape: torch.Size([1, 20562]) Labels shape: torch.Size([1, 20562]) Final batch size: 1, sequence length: 15339 Attention mask shape: torch.Size([1, 1, 15339, 15339]) Position ids shape: torch.Size([1, 15339]) Input IDs shape: torch.Size([1, 15339]) Labels shape: torch.Size([1, 15339]) Final batch size: 1, sequence length: 19187 Attention mask shape: torch.Size([1, 1, 19187, 19187]) Position ids shape: torch.Size([1, 19187]) Input IDs shape: torch.Size([1, 19187]) Labels shape: torch.Size([1, 19187]) Final batch size: 1, sequence length: 24769 Attention mask shape: torch.Size([1, 1, 24769, 24769]) Position ids shape: torch.Size([1, 24769]) Input IDs shape: torch.Size([1, 24769]) Labels shape: torch.Size([1, 24769]) Final batch size: 1, sequence length: 10182 Attention mask shape: torch.Size([1, 1, 10182, 10182]) Position ids shape: torch.Size([1, 10182]) Input IDs shape: torch.Size([1, 10182]) Labels shape: torch.Size([1, 10182]) Final batch size: 1, sequence length: 24002 Attention mask shape: torch.Size([1, 1, 24002, 24002]) Position ids shape: torch.Size([1, 24002]) Input IDs shape: torch.Size([1, 24002]) Labels shape: torch.Size([1, 24002]) Final batch size: 1, sequence length: 21664 Attention mask shape: torch.Size([1, 1, 21664, 21664]) Position ids shape: torch.Size([1, 21664]) Input IDs shape: torch.Size([1, 21664]) Labels shape: torch.Size([1, 21664]) Final batch size: 1, sequence length: 22857 Attention mask shape: torch.Size([1, 1, 22857, 22857]) Position ids shape: torch.Size([1, 22857]) Input IDs shape: torch.Size([1, 22857]) Labels shape: torch.Size([1, 22857]) Final batch size: 1, sequence length: 26144 Attention mask shape: torch.Size([1, 1, 26144, 26144]) Position ids shape: torch.Size([1, 26144]) Input IDs shape: torch.Size([1, 26144]) Labels shape: torch.Size([1, 26144]) Final batch size: 1, sequence length: 24428 Attention mask shape: torch.Size([1, 1, 24428, 24428]) Position ids shape: torch.Size([1, 24428]) Input IDs shape: torch.Size([1, 24428]) Labels shape: torch.Size([1, 24428]) Final batch size: 1, sequence length: 14556 Attention mask shape: torch.Size([1, 1, 14556, 14556]) Position ids shape: torch.Size([1, 14556]) Input IDs shape: torch.Size([1, 14556]) Labels shape: torch.Size([1, 14556]) Final batch size: 1, sequence length: 24858 Attention mask shape: torch.Size([1, 1, 24858, 24858]) Position ids shape: torch.Size([1, 24858]) Input IDs shape: torch.Size([1, 24858]) Labels shape: torch.Size([1, 24858]) Final batch size: 1, sequence length: 26312 Attention mask shape: torch.Size([1, 1, 26312, 26312]) Position ids shape: torch.Size([1, 26312]) Input IDs shape: torch.Size([1, 26312]) Labels shape: torch.Size([1, 26312]) Final batch size: 1, sequence length: 16541 Attention mask shape: torch.Size([1, 1, 16541, 16541]) Position ids shape: torch.Size([1, 16541]) Input IDs shape: torch.Size([1, 16541]) Labels shape: torch.Size([1, 16541]) Final batch size: 1, sequence length: 25707 Attention mask shape: torch.Size([1, 1, 25707, 25707]) Position ids shape: torch.Size([1, 25707]) Input IDs shape: torch.Size([1, 25707]) Labels shape: torch.Size([1, 25707]) Final batch size: 1, sequence length: 25758 Attention mask shape: torch.Size([1, 1, 25758, 25758]) Position ids shape: torch.Size([1, 25758]) Input IDs shape: torch.Size([1, 25758]) Labels shape: torch.Size([1, 25758]) Final batch size: 1, sequence length: 26033 Attention mask shape: torch.Size([1, 1, 26033, 26033]) Position ids shape: torch.Size([1, 26033]) Input IDs shape: torch.Size([1, 26033]) Labels shape: torch.Size([1, 26033]) Final batch size: 1, sequence length: 23602 Attention mask shape: torch.Size([1, 1, 23602, 23602]) Position ids shape: torch.Size([1, 23602]) Input IDs shape: torch.Size([1, 23602]) Labels shape: torch.Size([1, 23602]) Final batch size: 1, sequence length: 25886 Attention mask shape: torch.Size([1, 1, 25886, 25886]) Position ids shape: torch.Size([1, 25886]) Input IDs shape: torch.Size([1, 25886]) Labels shape: torch.Size([1, 25886]) Final batch size: 1, sequence length: 30079 Attention mask shape: torch.Size([1, 1, 30079, 30079]) Position ids shape: torch.Size([1, 30079]) Input IDs shape: torch.Size([1, 30079]) Labels shape: torch.Size([1, 30079]) Final batch size: 1, sequence length: 10318 Attention mask shape: torch.Size([1, 1, 10318, 10318]) Position ids shape: torch.Size([1, 10318]) Input IDs shape: torch.Size([1, 10318]) Labels shape: torch.Size([1, 10318]) Final batch size: 1, sequence length: 29948 Attention mask shape: torch.Size([1, 1, 29948, 29948]) Position ids shape: torch.Size([1, 29948]) Input IDs shape: torch.Size([1, 29948]) Labels shape: torch.Size([1, 29948]) Final batch size: 1, sequence length: 23698 Attention mask shape: torch.Size([1, 1, 23698, 23698]) Position ids shape: torch.Size([1, 23698]) Input IDs shape: torch.Size([1, 23698]) Labels shape: torch.Size([1, 23698]) Final batch size: 1, sequence length: 25719 Attention mask shape: torch.Size([1, 1, 25719, 25719]) Position ids shape: torch.Size([1, 25719]) Input IDs shape: torch.Size([1, 25719]) Labels shape: torch.Size([1, 25719]) Final batch size: 1, sequence length: 29825 Attention mask shape: torch.Size([1, 1, 29825, 29825]) Position ids shape: torch.Size([1, 29825]) Input IDs shape: torch.Size([1, 29825]) Labels shape: torch.Size([1, 29825]) Final batch size: 1, sequence length: 21374 Attention mask shape: torch.Size([1, 1, 21374, 21374]) Position ids shape: torch.Size([1, 21374]) Input IDs shape: torch.Size([1, 21374]) Labels shape: torch.Size([1, 21374]) Final batch size: 1, sequence length: 25656 Attention mask shape: torch.Size([1, 1, 25656, 25656]) Position ids shape: torch.Size([1, 25656]) Input IDs shape: torch.Size([1, 25656]) Labels shape: torch.Size([1, 25656]) Final batch size: 1, sequence length: 32541 Attention mask shape: torch.Size([1, 1, 32541, 32541]) Position ids shape: torch.Size([1, 32541]) Input IDs shape: torch.Size([1, 32541]) Labels shape: torch.Size([1, 32541]) Final batch size: 1, sequence length: 29749 Attention mask shape: torch.Size([1, 1, 29749, 29749]) Position ids shape: torch.Size([1, 29749]) Input IDs shape: torch.Size([1, 29749]) Labels shape: torch.Size([1, 29749]) Final batch size: 1, sequence length: 32071 Attention mask shape: torch.Size([1, 1, 32071, 32071]) Position ids shape: torch.Size([1, 32071]) Input IDs shape: torch.Size([1, 32071]) Labels shape: torch.Size([1, 32071]) Final batch size: 1, sequence length: 25492 Attention mask shape: torch.Size([1, 1, 25492, 25492]) Position ids shape: torch.Size([1, 25492]) Input IDs shape: torch.Size([1, 25492]) Labels shape: torch.Size([1, 25492]) Final batch size: 1, sequence length: 33611 Attention mask shape: torch.Size([1, 1, 33611, 33611]) Position ids shape: torch.Size([1, 33611]) Input IDs shape: torch.Size([1, 33611]) Labels shape: torch.Size([1, 33611]) Final batch size: 1, sequence length: 19238 Attention mask shape: torch.Size([1, 1, 19238, 19238]) Position ids shape: torch.Size([1, 19238]) Input IDs shape: torch.Size([1, 19238]) Labels shape: torch.Size([1, 19238]) Final batch size: 1, sequence length: 23399 Attention mask shape: torch.Size([1, 1, 23399, 23399]) Position ids shape: torch.Size([1, 23399]) Input IDs shape: torch.Size([1, 23399]) Labels shape: torch.Size([1, 23399]) Final batch size: 1, sequence length: 9897 Attention mask shape: torch.Size([1, 1, 9897, 9897]) Position ids shape: torch.Size([1, 9897]) Input IDs shape: torch.Size([1, 9897]) Labels shape: torch.Size([1, 9897]) Final batch size: 1, sequence length: 22139 Attention mask shape: torch.Size([1, 1, 22139, 22139]) Position ids shape: torch.Size([1, 22139]) Input IDs shape: torch.Size([1, 22139]) Labels shape: torch.Size([1, 22139]) Final batch size: 1, sequence length: 15604 Attention mask shape: torch.Size([1, 1, 15604, 15604]) Position ids shape: torch.Size([1, 15604]) Input IDs shape: torch.Size([1, 15604]) Labels shape: torch.Size([1, 15604]) Final batch size: 1, sequence length: 37241 Attention mask shape: torch.Size([1, 1, 37241, 37241]) Position ids shape: torch.Size([1, 37241]) Input IDs shape: torch.Size([1, 37241]) Labels shape: torch.Size([1, 37241]) Final batch size: 1, sequence length: 34701 Attention mask shape: torch.Size([1, 1, 34701, 34701]) Position ids shape: torch.Size([1, 34701]) Input IDs shape: torch.Size([1, 34701]) Labels shape: torch.Size([1, 34701]) Final batch size: 1, sequence length: 34186 Attention mask shape: torch.Size([1, 1, 34186, 34186]) Position ids shape: torch.Size([1, 34186]) Input IDs shape: torch.Size([1, 34186]) Labels shape: torch.Size([1, 34186]) Final batch size: 1, sequence length: 36778 Attention mask shape: torch.Size([1, 1, 36778, 36778]) Position ids shape: torch.Size([1, 36778]) Input IDs shape: torch.Size([1, 36778]) Labels shape: torch.Size([1, 36778]) Final batch size: 1, sequence length: 37728 Attention mask shape: torch.Size([1, 1, 37728, 37728]) Position ids shape: torch.Size([1, 37728]) Input IDs shape: torch.Size([1, 37728]) Labels shape: torch.Size([1, 37728]) Final batch size: 1, sequence length: 40593 Attention mask shape: torch.Size([1, 1, 40593, 40593]) Position ids shape: torch.Size([1, 40593]) Input IDs shape: torch.Size([1, 40593]) Labels shape: torch.Size([1, 40593]) Final batch size: 1, sequence length: 36456 Attention mask shape: torch.Size([1, 1, 36456, 36456]) Position ids shape: torch.Size([1, 36456]) Input IDs shape: torch.Size([1, 36456]) Labels shape: torch.Size([1, 36456]) Final batch size: 1, sequence length: 21557 Attention mask shape: torch.Size([1, 1, 21557, 21557]) Position ids shape: torch.Size([1, 21557]) Input IDs shape: torch.Size([1, 21557]) Labels shape: torch.Size([1, 21557]) Final batch size: 1, sequence length: 15294 Attention mask shape: torch.Size([1, 1, 15294, 15294]) Position ids shape: torch.Size([1, 15294]) Input IDs shape: torch.Size([1, 15294]) Labels shape: torch.Size([1, 15294]) Final batch size: 1, sequence length: 33459 Attention mask shape: torch.Size([1, 1, 33459, 33459]) Position ids shape: torch.Size([1, 33459]) Input IDs shape: torch.Size([1, 33459]) Labels shape: torch.Size([1, 33459]) Final batch size: 1, sequence length: 23343 Attention mask shape: torch.Size([1, 1, 23343, 23343]) Position ids shape: torch.Size([1, 23343]) Input IDs shape: torch.Size([1, 23343]) Labels shape: torch.Size([1, 23343]) Final batch size: 1, sequence length: 40147 Attention mask shape: torch.Size([1, 1, 40147, 40147]) Position ids shape: torch.Size([1, 40147]) Input IDs shape: torch.Size([1, 40147]) Labels shape: torch.Size([1, 40147]) Final batch size: 1, sequence length: 31650 Attention mask shape: torch.Size([1, 1, 31650, 31650]) Position ids shape: torch.Size([1, 31650]) Input IDs shape: torch.Size([1, 31650]) Labels shape: torch.Size([1, 31650]) Final batch size: 1, sequence length: 31323 Attention mask shape: torch.Size([1, 1, 31323, 31323]) Position ids shape: torch.Size([1, 31323]) Input IDs shape: torch.Size([1, 31323]) Labels shape: torch.Size([1, 31323]) Final batch size: 1, sequence length: 21955 Attention mask shape: torch.Size([1, 1, 21955, 21955]) Position ids shape: torch.Size([1, 21955]) Input IDs shape: torch.Size([1, 21955]) Labels shape: torch.Size([1, 21955]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 39760 Attention mask shape: torch.Size([1, 1, 39760, 39760]) Position ids shape: torch.Size([1, 39760]) Input IDs shape: torch.Size([1, 39760]) Labels shape: torch.Size([1, 39760]) Final batch size: 1, sequence length: 35760 Attention mask shape: torch.Size([1, 1, 35760, 35760]) Position ids shape: torch.Size([1, 35760]) Input IDs shape: torch.Size([1, 35760]) Labels shape: torch.Size([1, 35760]) Final batch size: 1, sequence length: 9445 Attention mask shape: torch.Size([1, 1, 9445, 9445]) Position ids shape: torch.Size([1, 9445]) Input IDs shape: torch.Size([1, 9445]) Labels shape: torch.Size([1, 9445]) Final batch size: 1, sequence length: 22491 Attention mask shape: torch.Size([1, 1, 22491, 22491]) Position ids shape: torch.Size([1, 22491]) Input IDs shape: torch.Size([1, 22491]) Labels shape: torch.Size([1, 22491]) Final batch size: 1, sequence length: 27283 Attention mask shape: torch.Size([1, 1, 27283, 27283]) Position ids shape: torch.Size([1, 27283]) Input IDs shape: torch.Size([1, 27283]) Labels shape: torch.Size([1, 27283]) Final batch size: 1, sequence length: 22434 Attention mask shape: torch.Size([1, 1, 22434, 22434]) Position ids shape: torch.Size([1, 22434]) Input IDs shape: torch.Size([1, 22434]) Labels shape: torch.Size([1, 22434]) Final batch size: 1, sequence length: 26664 Attention mask shape: torch.Size([1, 1, 26664, 26664]) Position ids shape: torch.Size([1, 26664]) Input IDs shape: torch.Size([1, 26664]) Labels shape: torch.Size([1, 26664]) Final batch size: 1, sequence length: 40507 Attention mask shape: torch.Size([1, 1, 40507, 40507]) Position ids shape: torch.Size([1, 40507]) Input IDs shape: torch.Size([1, 40507]) Labels shape: torch.Size([1, 40507]) Final batch size: 1, sequence length: 17646 Attention mask shape: torch.Size([1, 1, 17646, 17646]) Position ids shape: torch.Size([1, 17646]) Input IDs shape: torch.Size([1, 17646]) Labels shape: torch.Size([1, 17646]) Final batch size: 1, sequence length: 37183 Attention mask shape: torch.Size([1, 1, 37183, 37183]) Position ids shape: torch.Size([1, 37183]) Input IDs shape: torch.Size([1, 37183]) Labels shape: torch.Size([1, 37183]) Final batch size: 1, sequence length: 19867 Attention mask shape: torch.Size([1, 1, 19867, 19867]) Position ids shape: torch.Size([1, 19867]) Input IDs shape: torch.Size([1, 19867]) Labels shape: torch.Size([1, 19867]) Final batch size: 1, sequence length: 7681 Attention mask shape: torch.Size([1, 1, 7681, 7681]) Position ids shape: torch.Size([1, 7681]) Input IDs shape: torch.Size([1, 7681]) Labels shape: torch.Size([1, 7681]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 21489 Attention mask shape: torch.Size([1, 1, 21489, 21489]) Position ids shape: torch.Size([1, 21489]) Input IDs shape: torch.Size([1, 21489]) Labels shape: torch.Size([1, 21489]) Final batch size: 1, sequence length: 29082 Attention mask shape: torch.Size([1, 1, 29082, 29082]) Position ids shape: torch.Size([1, 29082]) Input IDs shape: torch.Size([1, 29082]) Labels shape: torch.Size([1, 29082]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 37775 Attention mask shape: torch.Size([1, 1, 37775, 37775]) Position ids shape: torch.Size([1, 37775]) Input IDs shape: torch.Size([1, 37775]) Labels shape: torch.Size([1, 37775]) Final batch size: 1, sequence length: 23014 Attention mask shape: torch.Size([1, 1, 23014, 23014]) Position ids shape: torch.Size([1, 23014]) Input IDs shape: torch.Size([1, 23014]) Labels shape: torch.Size([1, 23014]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 28661 Attention mask shape: torch.Size([1, 1, 28661, 28661]) Position ids shape: torch.Size([1, 28661]) Input IDs shape: torch.Size([1, 28661]) Labels shape: torch.Size([1, 28661]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 16273 Attention mask shape: torch.Size([1, 1, 16273, 16273]) Position ids shape: torch.Size([1, 16273]) Input IDs shape: torch.Size([1, 16273]) Labels shape: torch.Size([1, 16273]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 38686 Attention mask shape: torch.Size([1, 1, 38686, 38686]) Position ids shape: torch.Size([1, 38686]) Input IDs shape: torch.Size([1, 38686]) Labels shape: torch.Size([1, 38686]) Final batch size: 1, sequence length: 36579 Attention mask shape: torch.Size([1, 1, 36579, 36579]) Position ids shape: torch.Size([1, 36579]) Input IDs shape: torch.Size([1, 36579]) Labels shape: torch.Size([1, 36579]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 37091 Attention mask shape: torch.Size([1, 1, 37091, 37091]) Position ids shape: torch.Size([1, 37091]) Input IDs shape: torch.Size([1, 37091]) Labels shape: torch.Size([1, 37091]) Final batch size: 1, sequence length: 25914 Attention mask shape: torch.Size([1, 1, 25914, 25914]) Position ids shape: torch.Size([1, 25914]) Input IDs shape: torch.Size([1, 25914]) Labels shape: torch.Size([1, 25914]) Final batch size: 1, sequence length: 33263 Attention mask shape: torch.Size([1, 1, 33263, 33263]) Position ids shape: torch.Size([1, 33263]) Input IDs shape: torch.Size([1, 33263]) Labels shape: torch.Size([1, 33263]) Final batch size: 1, sequence length: 30364 Attention mask shape: torch.Size([1, 1, 30364, 30364]) Position ids shape: torch.Size([1, 30364]) Input IDs shape: torch.Size([1, 30364]) Labels shape: torch.Size([1, 30364]) Final batch size: 1, sequence length: 36297 Attention mask shape: torch.Size([1, 1, 36297, 36297]) Position ids shape: torch.Size([1, 36297]) Input IDs shape: torch.Size([1, 36297]) Labels shape: torch.Size([1, 36297]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) {'loss': 0.2678, 'grad_norm': 0.3168439355351538, 'learning_rate': 4.738320218785281e-06, 'num_tokens': -inf, 'epoch': 4.5} Final batch size: 1, sequence length: 6519 Attention mask shape: torch.Size([1, 1, 6519, 6519]) Position ids shape: torch.Size([1, 6519]) Input IDs shape: torch.Size([1, 6519]) Labels shape: torch.Size([1, 6519]) Final batch size: 1, sequence length: 5525 Attention mask shape: torch.Size([1, 1, 5525, 5525]) Position ids shape: torch.Size([1, 5525]) Input IDs shape: torch.Size([1, 5525]) Labels shape: torch.Size([1, 5525]) Final batch size: 1, sequence length: 9181 Attention mask shape: torch.Size([1, 1, 9181, 9181]) Position ids shape: torch.Size([1, 9181]) Input IDs shape: torch.Size([1, 9181]) Labels shape: torch.Size([1, 9181]) Final batch size: 1, sequence length: 10273 Attention mask shape: torch.Size([1, 1, 10273, 10273]) Position ids shape: torch.Size([1, 10273]) Input IDs shape: torch.Size([1, 10273]) Labels shape: torch.Size([1, 10273]) Final batch size: 1, sequence length: 13385 Attention mask shape: torch.Size([1, 1, 13385, 13385]) Position ids shape: torch.Size([1, 13385]) Input IDs shape: torch.Size([1, 13385]) Labels shape: torch.Size([1, 13385]) Final batch size: 1, sequence length: 12927 Attention mask shape: torch.Size([1, 1, 12927, 12927]) Position ids shape: torch.Size([1, 12927]) Input IDs shape: torch.Size([1, 12927]) Labels shape: torch.Size([1, 12927]) Final batch size: 1, sequence length: 10905 Attention mask shape: torch.Size([1, 1, 10905, 10905]) Position ids shape: torch.Size([1, 10905]) Input IDs shape: torch.Size([1, 10905]) Labels shape: torch.Size([1, 10905]) Final batch size: 1, sequence length: 10408 Attention mask shape: torch.Size([1, 1, 10408, 10408]) Position ids shape: torch.Size([1, 10408]) Input IDs shape: torch.Size([1, 10408]) Labels shape: torch.Size([1, 10408]) Final batch size: 1, sequence length: 13804 Attention mask shape: torch.Size([1, 1, 13804, 13804]) Position ids shape: torch.Size([1, 13804]) Input IDs shape: torch.Size([1, 13804]) Labels shape: torch.Size([1, 13804]) Final batch size: 1, sequence length: 12281 Attention mask shape: torch.Size([1, 1, 12281, 12281]) Position ids shape: torch.Size([1, 12281]) Input IDs shape: torch.Size([1, 12281]) Labels shape: torch.Size([1, 12281]) Final batch size: 1, sequence length: 13363 Attention mask shape: torch.Size([1, 1, 13363, 13363]) Position ids shape: torch.Size([1, 13363]) Input IDs shape: torch.Size([1, 13363]) Labels shape: torch.Size([1, 13363]) Final batch size: 1, sequence length: 15257 Attention mask shape: torch.Size([1, 1, 15257, 15257]) Position ids shape: torch.Size([1, 15257]) Input IDs shape: torch.Size([1, 15257]) Labels shape: torch.Size([1, 15257]) Final batch size: 1, sequence length: 15518 Attention mask shape: torch.Size([1, 1, 15518, 15518]) Position ids shape: torch.Size([1, 15518]) Input IDs shape: torch.Size([1, 15518]) Labels shape: torch.Size([1, 15518]) Final batch size: 1, sequence length: 17003 Attention mask shape: torch.Size([1, 1, 17003, 17003]) Position ids shape: torch.Size([1, 17003]) Input IDs shape: torch.Size([1, 17003]) Labels shape: torch.Size([1, 17003]) Final batch size: 1, sequence length: 19768 Attention mask shape: torch.Size([1, 1, 19768, 19768]) Position ids shape: torch.Size([1, 19768]) Input IDs shape: torch.Size([1, 19768]) Labels shape: torch.Size([1, 19768]) Final batch size: 1, sequence length: 16520 Attention mask shape: torch.Size([1, 1, 16520, 16520]) Position ids shape: torch.Size([1, 16520]) Input IDs shape: torch.Size([1, 16520]) Labels shape: torch.Size([1, 16520]) Final batch size: 1, sequence length: 8646 Attention mask shape: torch.Size([1, 1, 8646, 8646]) Position ids shape: torch.Size([1, 8646]) Input IDs shape: torch.Size([1, 8646]) Labels shape: torch.Size([1, 8646]) Final batch size: 1, sequence length: 15244 Attention mask shape: torch.Size([1, 1, 15244, 15244]) Position ids shape: torch.Size([1, 15244]) Input IDs shape: torch.Size([1, 15244]) Labels shape: torch.Size([1, 15244]) Final batch size: 1, sequence length: 20089 Attention mask shape: torch.Size([1, 1, 20089, 20089]) Position ids shape: torch.Size([1, 20089]) Input IDs shape: torch.Size([1, 20089]) Labels shape: torch.Size([1, 20089]) Final batch size: 1, sequence length: 18645 Attention mask shape: torch.Size([1, 1, 18645, 18645]) Position ids shape: torch.Size([1, 18645]) Input IDs shape: torch.Size([1, 18645]) Labels shape: torch.Size([1, 18645]) Final batch size: 1, sequence length: 20979 Attention mask shape: torch.Size([1, 1, 20979, 20979]) Position ids shape: torch.Size([1, 20979]) Input IDs shape: torch.Size([1, 20979]) Labels shape: torch.Size([1, 20979]) Final batch size: 1, sequence length: 20106 Attention mask shape: torch.Size([1, 1, 20106, 20106]) Position ids shape: torch.Size([1, 20106]) Input IDs shape: torch.Size([1, 20106]) Labels shape: torch.Size([1, 20106]) Final batch size: 1, sequence length: 18819 Attention mask shape: torch.Size([1, 1, 18819, 18819]) Position ids shape: torch.Size([1, 18819]) Input IDs shape: torch.Size([1, 18819]) Labels shape: torch.Size([1, 18819]) Final batch size: 1, sequence length: 23886 Attention mask shape: torch.Size([1, 1, 23886, 23886]) Position ids shape: torch.Size([1, 23886]) Input IDs shape: torch.Size([1, 23886]) Labels shape: torch.Size([1, 23886]) Final batch size: 1, sequence length: 22777 Attention mask shape: torch.Size([1, 1, 22777, 22777]) Position ids shape: torch.Size([1, 22777]) Input IDs shape: torch.Size([1, 22777]) Labels shape: torch.Size([1, 22777]) Final batch size: 1, sequence length: 22915 Attention mask shape: torch.Size([1, 1, 22915, 22915]) Position ids shape: torch.Size([1, 22915]) Input IDs shape: torch.Size([1, 22915]) Labels shape: torch.Size([1, 22915]) Final batch size: 1, sequence length: 23942 Attention mask shape: torch.Size([1, 1, 23942, 23942]) Position ids shape: torch.Size([1, 23942]) Input IDs shape: torch.Size([1, 23942]) Labels shape: torch.Size([1, 23942]) Final batch size: 1, sequence length: 19286 Attention mask shape: torch.Size([1, 1, 19286, 19286]) Position ids shape: torch.Size([1, 19286]) Input IDs shape: torch.Size([1, 19286]) Labels shape: torch.Size([1, 19286]) Final batch size: 1, sequence length: 19513 Attention mask shape: torch.Size([1, 1, 19513, 19513]) Position ids shape: torch.Size([1, 19513]) Input IDs shape: torch.Size([1, 19513]) Labels shape: torch.Size([1, 19513]) Final batch size: 1, sequence length: 23995 Attention mask shape: torch.Size([1, 1, 23995, 23995]) Position ids shape: torch.Size([1, 23995]) Input IDs shape: torch.Size([1, 23995]) Labels shape: torch.Size([1, 23995]) Final batch size: 1, sequence length: 22311 Attention mask shape: torch.Size([1, 1, 22311, 22311]) Position ids shape: torch.Size([1, 22311]) Input IDs shape: torch.Size([1, 22311]) Labels shape: torch.Size([1, 22311]) Final batch size: 1, sequence length: 22160 Attention mask shape: torch.Size([1, 1, 22160, 22160]) Position ids shape: torch.Size([1, 22160]) Input IDs shape: torch.Size([1, 22160]) Labels shape: torch.Size([1, 22160]) Final batch size: 1, sequence length: 14429 Attention mask shape: torch.Size([1, 1, 14429, 14429]) Position ids shape: torch.Size([1, 14429]) Input IDs shape: torch.Size([1, 14429]) Labels shape: torch.Size([1, 14429]) Final batch size: 1, sequence length: 22979 Attention mask shape: torch.Size([1, 1, 22979, 22979]) Position ids shape: torch.Size([1, 22979]) Input IDs shape: torch.Size([1, 22979]) Labels shape: torch.Size([1, 22979]) Final batch size: 1, sequence length: 5405 Attention mask shape: torch.Size([1, 1, 5405, 5405]) Position ids shape: torch.Size([1, 5405]) Input IDs shape: torch.Size([1, 5405]) Labels shape: torch.Size([1, 5405]) Final batch size: 1, sequence length: 25001 Attention mask shape: torch.Size([1, 1, 25001, 25001]) Position ids shape: torch.Size([1, 25001]) Input IDs shape: torch.Size([1, 25001]) Labels shape: torch.Size([1, 25001]) Final batch size: 1, sequence length: 24293 Attention mask shape: torch.Size([1, 1, 24293, 24293]) Position ids shape: torch.Size([1, 24293]) Input IDs shape: torch.Size([1, 24293]) Labels shape: torch.Size([1, 24293]) Final batch size: 1, sequence length: 21660 Attention mask shape: torch.Size([1, 1, 21660, 21660]) Position ids shape: torch.Size([1, 21660]) Input IDs shape: torch.Size([1, 21660]) Labels shape: torch.Size([1, 21660]) Final batch size: 1, sequence length: 18988 Attention mask shape: torch.Size([1, 1, 18988, 18988]) Position ids shape: torch.Size([1, 18988]) Input IDs shape: torch.Size([1, 18988]) Labels shape: torch.Size([1, 18988]) Final batch size: 1, sequence length: 25021 Attention mask shape: torch.Size([1, 1, 25021, 25021]) Position ids shape: torch.Size([1, 25021]) Input IDs shape: torch.Size([1, 25021]) Labels shape: torch.Size([1, 25021]) Final batch size: 1, sequence length: 23341 Attention mask shape: torch.Size([1, 1, 23341, 23341]) Position ids shape: torch.Size([1, 23341]) Input IDs shape: torch.Size([1, 23341]) Labels shape: torch.Size([1, 23341]) Final batch size: 1, sequence length: 5288 Attention mask shape: torch.Size([1, 1, 5288, 5288]) Position ids shape: torch.Size([1, 5288]) Input IDs shape: torch.Size([1, 5288]) Labels shape: torch.Size([1, 5288]) Final batch size: 1, sequence length: 24287 Attention mask shape: torch.Size([1, 1, 24287, 24287]) Position ids shape: torch.Size([1, 24287]) Input IDs shape: torch.Size([1, 24287]) Labels shape: torch.Size([1, 24287]) Final batch size: 1, sequence length: 12483 Attention mask shape: torch.Size([1, 1, 12483, 12483]) Position ids shape: torch.Size([1, 12483]) Input IDs shape: torch.Size([1, 12483]) Labels shape: torch.Size([1, 12483]) Final batch size: 1, sequence length: 13600 Attention mask shape: torch.Size([1, 1, 13600, 13600]) Position ids shape: torch.Size([1, 13600]) Input IDs shape: torch.Size([1, 13600]) Labels shape: torch.Size([1, 13600]) Final batch size: 1, sequence length: 23724 Attention mask shape: torch.Size([1, 1, 23724, 23724]) Position ids shape: torch.Size([1, 23724]) Input IDs shape: torch.Size([1, 23724]) Labels shape: torch.Size([1, 23724]) Final batch size: 1, sequence length: 11266 Attention mask shape: torch.Size([1, 1, 11266, 11266]) Position ids shape: torch.Size([1, 11266]) Input IDs shape: torch.Size([1, 11266]) Labels shape: torch.Size([1, 11266]) Final batch size: 1, sequence length: 24407 Attention mask shape: torch.Size([1, 1, 24407, 24407]) Position ids shape: torch.Size([1, 24407]) Input IDs shape: torch.Size([1, 24407]) Labels shape: torch.Size([1, 24407]) Final batch size: 1, sequence length: 20941 Attention mask shape: torch.Size([1, 1, 20941, 20941]) Position ids shape: torch.Size([1, 20941]) Input IDs shape: torch.Size([1, 20941]) Labels shape: torch.Size([1, 20941]) Final batch size: 1, sequence length: 26179 Attention mask shape: torch.Size([1, 1, 26179, 26179]) Position ids shape: torch.Size([1, 26179]) Input IDs shape: torch.Size([1, 26179]) Labels shape: torch.Size([1, 26179]) Final batch size: 1, sequence length: 29009 Attention mask shape: torch.Size([1, 1, 29009, 29009]) Position ids shape: torch.Size([1, 29009]) Input IDs shape: torch.Size([1, 29009]) Labels shape: torch.Size([1, 29009]) Final batch size: 1, sequence length: 30723 Attention mask shape: torch.Size([1, 1, 30723, 30723]) Position ids shape: torch.Size([1, 30723]) Input IDs shape: torch.Size([1, 30723]) Labels shape: torch.Size([1, 30723]) Final batch size: 1, sequence length: 26054 Attention mask shape: torch.Size([1, 1, 26054, 26054]) Position ids shape: torch.Size([1, 26054]) Input IDs shape: torch.Size([1, 26054]) Labels shape: torch.Size([1, 26054]) Final batch size: 1, sequence length: 26461 Attention mask shape: torch.Size([1, 1, 26461, 26461]) Position ids shape: torch.Size([1, 26461]) Input IDs shape: torch.Size([1, 26461]) Labels shape: torch.Size([1, 26461]) Final batch size: 1, sequence length: 30236 Attention mask shape: torch.Size([1, 1, 30236, 30236]) Position ids shape: torch.Size([1, 30236]) Input IDs shape: torch.Size([1, 30236]) Labels shape: torch.Size([1, 30236]) Final batch size: 1, sequence length: 28440 Attention mask shape: torch.Size([1, 1, 28440, 28440]) Position ids shape: torch.Size([1, 28440]) Input IDs shape: torch.Size([1, 28440]) Labels shape: torch.Size([1, 28440]) Final batch size: 1, sequence length: 26886 Attention mask shape: torch.Size([1, 1, 26886, 26886]) Position ids shape: torch.Size([1, 26886]) Input IDs shape: torch.Size([1, 26886]) Labels shape: torch.Size([1, 26886]) Final batch size: 1, sequence length: 17951 Attention mask shape: torch.Size([1, 1, 17951, 17951]) Position ids shape: torch.Size([1, 17951]) Input IDs shape: torch.Size([1, 17951]) Labels shape: torch.Size([1, 17951]) Final batch size: 1, sequence length: 32083 Attention mask shape: torch.Size([1, 1, 32083, 32083]) Position ids shape: torch.Size([1, 32083]) Input IDs shape: torch.Size([1, 32083]) Labels shape: torch.Size([1, 32083]) Final batch size: 1, sequence length: 32799 Attention mask shape: torch.Size([1, 1, 32799, 32799]) Position ids shape: torch.Size([1, 32799]) Input IDs shape: torch.Size([1, 32799]) Labels shape: torch.Size([1, 32799]) Final batch size: 1, sequence length: 21472 Attention mask shape: torch.Size([1, 1, 21472, 21472]) Position ids shape: torch.Size([1, 21472]) Input IDs shape: torch.Size([1, 21472]) Labels shape: torch.Size([1, 21472]) Final batch size: 1, sequence length: 32660 Attention mask shape: torch.Size([1, 1, 32660, 32660]) Position ids shape: torch.Size([1, 32660]) Input IDs shape: torch.Size([1, 32660]) Labels shape: torch.Size([1, 32660]) Final batch size: 1, sequence length: 17649 Attention mask shape: torch.Size([1, 1, 17649, 17649]) Position ids shape: torch.Size([1, 17649]) Input IDs shape: torch.Size([1, 17649]) Labels shape: torch.Size([1, 17649]) Final batch size: 1, sequence length: 27405 Attention mask shape: torch.Size([1, 1, 27405, 27405]) Position ids shape: torch.Size([1, 27405]) Input IDs shape: torch.Size([1, 27405]) Labels shape: torch.Size([1, 27405]) Final batch size: 1, sequence length: 17539 Attention mask shape: torch.Size([1, 1, 17539, 17539]) Position ids shape: torch.Size([1, 17539]) Input IDs shape: torch.Size([1, 17539]) Labels shape: torch.Size([1, 17539]) Final batch size: 1, sequence length: 29109 Attention mask shape: torch.Size([1, 1, 29109, 29109]) Position ids shape: torch.Size([1, 29109]) Input IDs shape: torch.Size([1, 29109]) Labels shape: torch.Size([1, 29109]) Final batch size: 1, sequence length: 28263 Attention mask shape: torch.Size([1, 1, 28263, 28263]) Position ids shape: torch.Size([1, 28263]) Input IDs shape: torch.Size([1, 28263]) Labels shape: torch.Size([1, 28263]) Final batch size: 1, sequence length: 31512 Attention mask shape: torch.Size([1, 1, 31512, 31512]) Position ids shape: torch.Size([1, 31512]) Input IDs shape: torch.Size([1, 31512]) Labels shape: torch.Size([1, 31512]) Final batch size: 1, sequence length: 29561 Attention mask shape: torch.Size([1, 1, 29561, 29561]) Position ids shape: torch.Size([1, 29561]) Input IDs shape: torch.Size([1, 29561]) Labels shape: torch.Size([1, 29561]) Final batch size: 1, sequence length: 27987 Attention mask shape: torch.Size([1, 1, 27987, 27987]) Position ids shape: torch.Size([1, 27987]) Input IDs shape: torch.Size([1, 27987]) Labels shape: torch.Size([1, 27987]) Final batch size: 1, sequence length: 35478 Attention mask shape: torch.Size([1, 1, 35478, 35478]) Position ids shape: torch.Size([1, 35478]) Input IDs shape: torch.Size([1, 35478]) Labels shape: torch.Size([1, 35478]) Final batch size: 1, sequence length: 29237 Attention mask shape: torch.Size([1, 1, 29237, 29237]) Position ids shape: torch.Size([1, 29237]) Input IDs shape: torch.Size([1, 29237]) Labels shape: torch.Size([1, 29237]) Final batch size: 1, sequence length: 29152 Attention mask shape: torch.Size([1, 1, 29152, 29152]) Position ids shape: torch.Size([1, 29152]) Input IDs shape: torch.Size([1, 29152]) Labels shape: torch.Size([1, 29152]) Final batch size: 1, sequence length: 34694 Attention mask shape: torch.Size([1, 1, 34694, 34694]) Position ids shape: torch.Size([1, 34694]) Input IDs shape: torch.Size([1, 34694]) Labels shape: torch.Size([1, 34694]) Final batch size: 1, sequence length: 32287 Attention mask shape: torch.Size([1, 1, 32287, 32287]) Position ids shape: torch.Size([1, 32287]) Input IDs shape: torch.Size([1, 32287]) Labels shape: torch.Size([1, 32287]) Final batch size: 1, sequence length: 26452 Attention mask shape: torch.Size([1, 1, 26452, 26452]) Position ids shape: torch.Size([1, 26452]) Input IDs shape: torch.Size([1, 26452]) Labels shape: torch.Size([1, 26452]) Final batch size: 1, sequence length: 24056 Attention mask shape: torch.Size([1, 1, 24056, 24056]) Position ids shape: torch.Size([1, 24056]) Input IDs shape: torch.Size([1, 24056]) Labels shape: torch.Size([1, 24056]) Final batch size: 1, sequence length: 15830 Attention mask shape: torch.Size([1, 1, 15830, 15830]) Position ids shape: torch.Size([1, 15830]) Input IDs shape: torch.Size([1, 15830]) Labels shape: torch.Size([1, 15830]) Final batch size: 1, sequence length: 8008 Attention mask shape: torch.Size([1, 1, 8008, 8008]) Position ids shape: torch.Size([1, 8008]) Input IDs shape: torch.Size([1, 8008]) Labels shape: torch.Size([1, 8008]) Final batch size: 1, sequence length: 17775 Attention mask shape: torch.Size([1, 1, 17775, 17775]) Position ids shape: torch.Size([1, 17775]) Input IDs shape: torch.Size([1, 17775]) Labels shape: torch.Size([1, 17775]) Final batch size: 1, sequence length: 29880 Attention mask shape: torch.Size([1, 1, 29880, 29880]) Position ids shape: torch.Size([1, 29880]) Input IDs shape: torch.Size([1, 29880]) Labels shape: torch.Size([1, 29880]) Final batch size: 1, sequence length: 15906 Attention mask shape: torch.Size([1, 1, 15906, 15906]) Position ids shape: torch.Size([1, 15906]) Input IDs shape: torch.Size([1, 15906]) Labels shape: torch.Size([1, 15906]) Final batch size: 1, sequence length: 23945 Attention mask shape: torch.Size([1, 1, 23945, 23945]) Position ids shape: torch.Size([1, 23945]) Input IDs shape: torch.Size([1, 23945]) Labels shape: torch.Size([1, 23945]) Final batch size: 1, sequence length: 34777 Attention mask shape: torch.Size([1, 1, 34777, 34777]) Position ids shape: torch.Size([1, 34777]) Input IDs shape: torch.Size([1, 34777]) Labels shape: torch.Size([1, 34777]) Final batch size: 1, sequence length: 7448 Attention mask shape: torch.Size([1, 1, 7448, 7448]) Position ids shape: torch.Size([1, 7448]) Input IDs shape: torch.Size([1, 7448]) Labels shape: torch.Size([1, 7448]) Final batch size: 1, sequence length: 35696 Attention mask shape: torch.Size([1, 1, 35696, 35696]) Position ids shape: torch.Size([1, 35696]) Input IDs shape: torch.Size([1, 35696]) Labels shape: torch.Size([1, 35696]) Final batch size: 1, sequence length: 16852 Attention mask shape: torch.Size([1, 1, 16852, 16852]) Position ids shape: torch.Size([1, 16852]) Input IDs shape: torch.Size([1, 16852]) Labels shape: torch.Size([1, 16852]) Final batch size: 1, sequence length: 39476 Attention mask shape: torch.Size([1, 1, 39476, 39476]) Position ids shape: torch.Size([1, 39476]) Input IDs shape: torch.Size([1, 39476]) Labels shape: torch.Size([1, 39476]) Final batch size: 1, sequence length: 16631 Attention mask shape: torch.Size([1, 1, 16631, 16631]) Position ids shape: torch.Size([1, 16631]) Input IDs shape: torch.Size([1, 16631]) Labels shape: torch.Size([1, 16631]) Final batch size: 1, sequence length: 29262 Attention mask shape: torch.Size([1, 1, 29262, 29262]) Position ids shape: torch.Size([1, 29262]) Input IDs shape: torch.Size([1, 29262]) Labels shape: torch.Size([1, 29262]) Final batch size: 1, sequence length: 31786 Attention mask shape: torch.Size([1, 1, 31786, 31786]) Position ids shape: torch.Size([1, 31786]) Input IDs shape: torch.Size([1, 31786]) Labels shape: torch.Size([1, 31786]) Final batch size: 1, sequence length: 37933 Attention mask shape: torch.Size([1, 1, 37933, 37933]) Position ids shape: torch.Size([1, 37933]) Input IDs shape: torch.Size([1, 37933]) Labels shape: torch.Size([1, 37933]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 28879 Attention mask shape: torch.Size([1, 1, 28879, 28879]) Position ids shape: torch.Size([1, 28879]) Input IDs shape: torch.Size([1, 28879]) Labels shape: torch.Size([1, 28879]) Final batch size: 1, sequence length: 36314 Attention mask shape: torch.Size([1, 1, 36314, 36314]) Position ids shape: torch.Size([1, 36314]) Input IDs shape: torch.Size([1, 36314]) Labels shape: torch.Size([1, 36314]) Final batch size: 1, sequence length: 28691 Attention mask shape: torch.Size([1, 1, 28691, 28691]) Position ids shape: torch.Size([1, 28691]) Input IDs shape: torch.Size([1, 28691]) Labels shape: torch.Size([1, 28691]) Final batch size: 1, sequence length: 35526 Attention mask shape: torch.Size([1, 1, 35526, 35526]) Position ids shape: torch.Size([1, 35526]) Input IDs shape: torch.Size([1, 35526]) Labels shape: torch.Size([1, 35526]) Final batch size: 1, sequence length: 35999 Attention mask shape: torch.Size([1, 1, 35999, 35999]) Position ids shape: torch.Size([1, 35999]) Input IDs shape: torch.Size([1, 35999]) Labels shape: torch.Size([1, 35999]) Final batch size: 1, sequence length: 10480 Attention mask shape: torch.Size([1, 1, 10480, 10480]) Position ids shape: torch.Size([1, 10480]) Input IDs shape: torch.Size([1, 10480]) Labels shape: torch.Size([1, 10480]) Final batch size: 1, sequence length: 24824 Attention mask shape: torch.Size([1, 1, 24824, 24824]) Position ids shape: torch.Size([1, 24824]) Input IDs shape: torch.Size([1, 24824]) Labels shape: torch.Size([1, 24824]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 31132 Attention mask shape: torch.Size([1, 1, 31132, 31132]) Position ids shape: torch.Size([1, 31132]) Input IDs shape: torch.Size([1, 31132]) Labels shape: torch.Size([1, 31132]) Final batch size: 1, sequence length: 28495 Attention mask shape: torch.Size([1, 1, 28495, 28495]) Position ids shape: torch.Size([1, 28495]) Input IDs shape: torch.Size([1, 28495]) Labels shape: torch.Size([1, 28495]) Final batch size: 1, sequence length: 20176 Attention mask shape: torch.Size([1, 1, 20176, 20176]) Position ids shape: torch.Size([1, 20176]) Input IDs shape: torch.Size([1, 20176]) Labels shape: torch.Size([1, 20176]) Final batch size: 1, sequence length: 20694 Attention mask shape: torch.Size([1, 1, 20694, 20694]) Position ids shape: torch.Size([1, 20694]) Input IDs shape: torch.Size([1, 20694]) Labels shape: torch.Size([1, 20694]) Final batch size: 1, sequence length: 6378 Attention mask shape: torch.Size([1, 1, 6378, 6378]) Position ids shape: torch.Size([1, 6378]) Input IDs shape: torch.Size([1, 6378]) Labels shape: torch.Size([1, 6378]) Final batch size: 1, sequence length: 11395 Attention mask shape: torch.Size([1, 1, 11395, 11395]) Position ids shape: torch.Size([1, 11395]) Input IDs shape: torch.Size([1, 11395]) Labels shape: torch.Size([1, 11395]) Final batch size: 1, sequence length: 14869 Attention mask shape: torch.Size([1, 1, 14869, 14869]) Position ids shape: torch.Size([1, 14869]) Input IDs shape: torch.Size([1, 14869]) Labels shape: torch.Size([1, 14869]) Final batch size: 1, sequence length: 18911 Attention mask shape: torch.Size([1, 1, 18911, 18911]) Position ids shape: torch.Size([1, 18911]) Input IDs shape: torch.Size([1, 18911]) Labels shape: torch.Size([1, 18911]) Final batch size: 1, sequence length: 18127 Attention mask shape: torch.Size([1, 1, 18127, 18127]) Position ids shape: torch.Size([1, 18127]) Input IDs shape: torch.Size([1, 18127]) Labels shape: torch.Size([1, 18127]) Final batch size: 1, sequence length: 38935 Attention mask shape: torch.Size([1, 1, 38935, 38935]) Position ids shape: torch.Size([1, 38935]) Input IDs shape: torch.Size([1, 38935]) Labels shape: torch.Size([1, 38935]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40605 Attention mask shape: torch.Size([1, 1, 40605, 40605]) Position ids shape: torch.Size([1, 40605]) Input IDs shape: torch.Size([1, 40605]) Labels shape: torch.Size([1, 40605]) Final batch size: 1, sequence length: 35116 Attention mask shape: torch.Size([1, 1, 35116, 35116]) Position ids shape: torch.Size([1, 35116]) Input IDs shape: torch.Size([1, 35116]) Labels shape: torch.Size([1, 35116]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 28313 Attention mask shape: torch.Size([1, 1, 28313, 28313]) Position ids shape: torch.Size([1, 28313]) Input IDs shape: torch.Size([1, 28313]) Labels shape: torch.Size([1, 28313]) Final batch size: 1, sequence length: 39687 Attention mask shape: torch.Size([1, 1, 39687, 39687]) Position ids shape: torch.Size([1, 39687]) Input IDs shape: torch.Size([1, 39687]) Labels shape: torch.Size([1, 39687]) Final batch size: 1, sequence length: 36469 Attention mask shape: torch.Size([1, 1, 36469, 36469]) Position ids shape: torch.Size([1, 36469]) Input IDs shape: torch.Size([1, 36469]) Labels shape: torch.Size([1, 36469]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 30024 Attention mask shape: torch.Size([1, 1, 30024, 30024]) Position ids shape: torch.Size([1, 30024]) Input IDs shape: torch.Size([1, 30024]) Labels shape: torch.Size([1, 30024]) Final batch size: 1, sequence length: 37159 Attention mask shape: torch.Size([1, 1, 37159, 37159]) Position ids shape: torch.Size([1, 37159]) Input IDs shape: torch.Size([1, 37159]) Labels shape: torch.Size([1, 37159]) Final batch size: 1, sequence length: 39324 Attention mask shape: torch.Size([1, 1, 39324, 39324]) Position ids shape: torch.Size([1, 39324]) Input IDs shape: torch.Size([1, 39324]) Labels shape: torch.Size([1, 39324]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 33070 Attention mask shape: torch.Size([1, 1, 33070, 33070]) Position ids shape: torch.Size([1, 33070]) Input IDs shape: torch.Size([1, 33070]) Labels shape: torch.Size([1, 33070]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 30245 Attention mask shape: torch.Size([1, 1, 30245, 30245]) Position ids shape: torch.Size([1, 30245]) Input IDs shape: torch.Size([1, 30245]) Labels shape: torch.Size([1, 30245]) Final batch size: 1, sequence length: 26402 Attention mask shape: torch.Size([1, 1, 26402, 26402]) Position ids shape: torch.Size([1, 26402]) Input IDs shape: torch.Size([1, 26402]) Labels shape: torch.Size([1, 26402]) {'loss': 0.25, 'grad_norm': 0.2900237388806637, 'learning_rate': 4.477357683661734e-06, 'num_tokens': -inf, 'epoch': 4.62} Final batch size: 1, sequence length: 3968 Attention mask shape: torch.Size([1, 1, 3968, 3968]) Position ids shape: torch.Size([1, 3968]) Input IDs shape: torch.Size([1, 3968]) Labels shape: torch.Size([1, 3968]) Final batch size: 1, sequence length: 6567 Attention mask shape: torch.Size([1, 1, 6567, 6567]) Position ids shape: torch.Size([1, 6567]) Input IDs shape: torch.Size([1, 6567]) Labels shape: torch.Size([1, 6567]) Final batch size: 1, sequence length: 8347 Attention mask shape: torch.Size([1, 1, 8347, 8347]) Position ids shape: torch.Size([1, 8347]) Input IDs shape: torch.Size([1, 8347]) Labels shape: torch.Size([1, 8347]) Final batch size: 1, sequence length: 5997 Attention mask shape: torch.Size([1, 1, 5997, 5997]) Position ids shape: torch.Size([1, 5997]) Input IDs shape: torch.Size([1, 5997]) Labels shape: torch.Size([1, 5997]) Final batch size: 1, sequence length: 9027 Attention mask shape: torch.Size([1, 1, 9027, 9027]) Position ids shape: torch.Size([1, 9027]) Input IDs shape: torch.Size([1, 9027]) Labels shape: torch.Size([1, 9027]) Final batch size: 1, sequence length: 11752 Attention mask shape: torch.Size([1, 1, 11752, 11752]) Position ids shape: torch.Size([1, 11752]) Input IDs shape: torch.Size([1, 11752]) Labels shape: torch.Size([1, 11752]) Final batch size: 1, sequence length: 11232 Attention mask shape: torch.Size([1, 1, 11232, 11232]) Position ids shape: torch.Size([1, 11232]) Input IDs shape: torch.Size([1, 11232]) Labels shape: torch.Size([1, 11232]) Final batch size: 1, sequence length: 11054 Attention mask shape: torch.Size([1, 1, 11054, 11054]) Position ids shape: torch.Size([1, 11054]) Input IDs shape: torch.Size([1, 11054]) Labels shape: torch.Size([1, 11054]) Final batch size: 1, sequence length: 12967 Attention mask shape: torch.Size([1, 1, 12967, 12967]) Position ids shape: torch.Size([1, 12967]) Input IDs shape: torch.Size([1, 12967]) Labels shape: torch.Size([1, 12967]) Final batch size: 1, sequence length: 14758 Attention mask shape: torch.Size([1, 1, 14758, 14758]) Position ids shape: torch.Size([1, 14758]) Input IDs shape: torch.Size([1, 14758]) Labels shape: torch.Size([1, 14758]) Final batch size: 1, sequence length: 13260 Attention mask shape: torch.Size([1, 1, 13260, 13260]) Position ids shape: torch.Size([1, 13260]) Input IDs shape: torch.Size([1, 13260]) Labels shape: torch.Size([1, 13260]) Final batch size: 1, sequence length: 11880 Attention mask shape: torch.Size([1, 1, 11880, 11880]) Position ids shape: torch.Size([1, 11880]) Input IDs shape: torch.Size([1, 11880]) Labels shape: torch.Size([1, 11880]) Final batch size: 1, sequence length: 14512 Attention mask shape: torch.Size([1, 1, 14512, 14512]) Position ids shape: torch.Size([1, 14512]) Input IDs shape: torch.Size([1, 14512]) Labels shape: torch.Size([1, 14512]) Final batch size: 1, sequence length: 16014 Attention mask shape: torch.Size([1, 1, 16014, 16014]) Position ids shape: torch.Size([1, 16014]) Input IDs shape: torch.Size([1, 16014]) Labels shape: torch.Size([1, 16014]) Final batch size: 1, sequence length: 17166 Attention mask shape: torch.Size([1, 1, 17166, 17166]) Position ids shape: torch.Size([1, 17166]) Input IDs shape: torch.Size([1, 17166]) Labels shape: torch.Size([1, 17166]) Final batch size: 1, sequence length: 12370 Attention mask shape: torch.Size([1, 1, 12370, 12370]) Position ids shape: torch.Size([1, 12370]) Input IDs shape: torch.Size([1, 12370]) Labels shape: torch.Size([1, 12370]) Final batch size: 1, sequence length: 17092 Attention mask shape: torch.Size([1, 1, 17092, 17092]) Position ids shape: torch.Size([1, 17092]) Input IDs shape: torch.Size([1, 17092]) Labels shape: torch.Size([1, 17092]) Final batch size: 1, sequence length: 14520 Attention mask shape: torch.Size([1, 1, 14520, 14520]) Position ids shape: torch.Size([1, 14520]) Input IDs shape: torch.Size([1, 14520]) Labels shape: torch.Size([1, 14520]) Final batch size: 1, sequence length: 17839 Attention mask shape: torch.Size([1, 1, 17839, 17839]) Position ids shape: torch.Size([1, 17839]) Input IDs shape: torch.Size([1, 17839]) Labels shape: torch.Size([1, 17839]) Final batch size: 1, sequence length: 14597 Attention mask shape: torch.Size([1, 1, 14597, 14597]) Position ids shape: torch.Size([1, 14597]) Input IDs shape: torch.Size([1, 14597]) Labels shape: torch.Size([1, 14597]) Final batch size: 1, sequence length: 18050 Attention mask shape: torch.Size([1, 1, 18050, 18050]) Position ids shape: torch.Size([1, 18050]) Input IDs shape: torch.Size([1, 18050]) Labels shape: torch.Size([1, 18050]) Final batch size: 1, sequence length: 15438 Attention mask shape: torch.Size([1, 1, 15438, 15438]) Position ids shape: torch.Size([1, 15438]) Input IDs shape: torch.Size([1, 15438]) Labels shape: torch.Size([1, 15438]) Final batch size: 1, sequence length: 18264 Attention mask shape: torch.Size([1, 1, 18264, 18264]) Position ids shape: torch.Size([1, 18264]) Input IDs shape: torch.Size([1, 18264]) Labels shape: torch.Size([1, 18264]) Final batch size: 1, sequence length: 20487 Attention mask shape: torch.Size([1, 1, 20487, 20487]) Position ids shape: torch.Size([1, 20487]) Input IDs shape: torch.Size([1, 20487]) Labels shape: torch.Size([1, 20487]) Final batch size: 1, sequence length: 20947 Attention mask shape: torch.Size([1, 1, 20947, 20947]) Position ids shape: torch.Size([1, 20947]) Input IDs shape: torch.Size([1, 20947]) Labels shape: torch.Size([1, 20947]) Final batch size: 1, sequence length: 11206 Attention mask shape: torch.Size([1, 1, 11206, 11206]) Position ids shape: torch.Size([1, 11206]) Input IDs shape: torch.Size([1, 11206]) Labels shape: torch.Size([1, 11206]) Final batch size: 1, sequence length: 14335 Attention mask shape: torch.Size([1, 1, 14335, 14335]) Position ids shape: torch.Size([1, 14335]) Input IDs shape: torch.Size([1, 14335]) Labels shape: torch.Size([1, 14335]) Final batch size: 1, sequence length: 18131 Attention mask shape: torch.Size([1, 1, 18131, 18131]) Position ids shape: torch.Size([1, 18131]) Input IDs shape: torch.Size([1, 18131]) Labels shape: torch.Size([1, 18131]) Final batch size: 1, sequence length: 13061 Attention mask shape: torch.Size([1, 1, 13061, 13061]) Position ids shape: torch.Size([1, 13061]) Input IDs shape: torch.Size([1, 13061]) Labels shape: torch.Size([1, 13061]) Final batch size: 1, sequence length: 18950 Attention mask shape: torch.Size([1, 1, 18950, 18950]) Position ids shape: torch.Size([1, 18950]) Input IDs shape: torch.Size([1, 18950]) Labels shape: torch.Size([1, 18950]) Final batch size: 1, sequence length: 22107 Attention mask shape: torch.Size([1, 1, 22107, 22107]) Position ids shape: torch.Size([1, 22107]) Input IDs shape: torch.Size([1, 22107]) Labels shape: torch.Size([1, 22107]) Final batch size: 1, sequence length: 23324 Attention mask shape: torch.Size([1, 1, 23324, 23324]) Position ids shape: torch.Size([1, 23324]) Input IDs shape: torch.Size([1, 23324]) Labels shape: torch.Size([1, 23324]) Final batch size: 1, sequence length: 24566 Attention mask shape: torch.Size([1, 1, 24566, 24566]) Position ids shape: torch.Size([1, 24566]) Input IDs shape: torch.Size([1, 24566]) Labels shape: torch.Size([1, 24566]) Final batch size: 1, sequence length: 22547 Attention mask shape: torch.Size([1, 1, 22547, 22547]) Position ids shape: torch.Size([1, 22547]) Input IDs shape: torch.Size([1, 22547]) Labels shape: torch.Size([1, 22547]) Final batch size: 1, sequence length: 24432 Attention mask shape: torch.Size([1, 1, 24432, 24432]) Position ids shape: torch.Size([1, 24432]) Input IDs shape: torch.Size([1, 24432]) Labels shape: torch.Size([1, 24432]) Final batch size: 1, sequence length: 17194 Attention mask shape: torch.Size([1, 1, 17194, 17194]) Position ids shape: torch.Size([1, 17194]) Input IDs shape: torch.Size([1, 17194]) Labels shape: torch.Size([1, 17194]) Final batch size: 1, sequence length: 26068 Attention mask shape: torch.Size([1, 1, 26068, 26068]) Position ids shape: torch.Size([1, 26068]) Input IDs shape: torch.Size([1, 26068]) Labels shape: torch.Size([1, 26068]) Final batch size: 1, sequence length: 23960 Attention mask shape: torch.Size([1, 1, 23960, 23960]) Position ids shape: torch.Size([1, 23960]) Input IDs shape: torch.Size([1, 23960]) Labels shape: torch.Size([1, 23960]) Final batch size: 1, sequence length: 21137 Attention mask shape: torch.Size([1, 1, 21137, 21137]) Position ids shape: torch.Size([1, 21137]) Input IDs shape: torch.Size([1, 21137]) Labels shape: torch.Size([1, 21137]) Final batch size: 1, sequence length: 15229 Attention mask shape: torch.Size([1, 1, 15229, 15229]) Position ids shape: torch.Size([1, 15229]) Input IDs shape: torch.Size([1, 15229]) Labels shape: torch.Size([1, 15229]) Final batch size: 1, sequence length: 17115 Attention mask shape: torch.Size([1, 1, 17115, 17115]) Position ids shape: torch.Size([1, 17115]) Input IDs shape: torch.Size([1, 17115]) Labels shape: torch.Size([1, 17115]) Final batch size: 1, sequence length: 26333 Attention mask shape: torch.Size([1, 1, 26333, 26333]) Position ids shape: torch.Size([1, 26333]) Input IDs shape: torch.Size([1, 26333]) Labels shape: torch.Size([1, 26333]) Final batch size: 1, sequence length: 29236 Attention mask shape: torch.Size([1, 1, 29236, 29236]) Position ids shape: torch.Size([1, 29236]) Input IDs shape: torch.Size([1, 29236]) Labels shape: torch.Size([1, 29236]) Final batch size: 1, sequence length: 28858 Attention mask shape: torch.Size([1, 1, 28858, 28858]) Position ids shape: torch.Size([1, 28858]) Input IDs shape: torch.Size([1, 28858]) Labels shape: torch.Size([1, 28858]) Final batch size: 1, sequence length: 28284 Attention mask shape: torch.Size([1, 1, 28284, 28284]) Position ids shape: torch.Size([1, 28284]) Input IDs shape: torch.Size([1, 28284]) Labels shape: torch.Size([1, 28284]) Final batch size: 1, sequence length: 21728 Final batch size: 1, sequence length: 26271 Attention mask shape: torch.Size([1, 1, 26271, 26271]) Position ids shape: torch.Size([1, 26271]) Input IDs shape: torch.Size([1, 26271]) Labels shape: torch.Size([1, 26271]) Attention mask shape: torch.Size([1, 1, 21728, 21728]) Position ids shape: torch.Size([1, 21728]) Input IDs shape: torch.Size([1, 21728]) Labels shape: torch.Size([1, 21728]) Final batch size: 1, sequence length: 29768 Attention mask shape: torch.Size([1, 1, 29768, 29768]) Position ids shape: torch.Size([1, 29768]) Input IDs shape: torch.Size([1, 29768]) Labels shape: torch.Size([1, 29768]) Final batch size: 1, sequence length: 27293 Attention mask shape: torch.Size([1, 1, 27293, 27293]) Position ids shape: torch.Size([1, 27293]) Input IDs shape: torch.Size([1, 27293]) Labels shape: torch.Size([1, 27293]) Final batch size: 1, sequence length: 27278 Attention mask shape: torch.Size([1, 1, 27278, 27278]) Position ids shape: torch.Size([1, 27278]) Input IDs shape: torch.Size([1, 27278]) Labels shape: torch.Size([1, 27278]) Final batch size: 1, sequence length: 18051 Attention mask shape: torch.Size([1, 1, 18051, 18051]) Position ids shape: torch.Size([1, 18051]) Input IDs shape: torch.Size([1, 18051]) Labels shape: torch.Size([1, 18051]) Final batch size: 1, sequence length: 25252 Attention mask shape: torch.Size([1, 1, 25252, 25252]) Position ids shape: torch.Size([1, 25252]) Input IDs shape: torch.Size([1, 25252]) Labels shape: torch.Size([1, 25252]) Final batch size: 1, sequence length: 10286 Attention mask shape: torch.Size([1, 1, 10286, 10286]) Position ids shape: torch.Size([1, 10286]) Input IDs shape: torch.Size([1, 10286]) Labels shape: torch.Size([1, 10286]) Final batch size: 1, sequence length: 29478 Attention mask shape: torch.Size([1, 1, 29478, 29478]) Position ids shape: torch.Size([1, 29478]) Input IDs shape: torch.Size([1, 29478]) Labels shape: torch.Size([1, 29478]) Final batch size: 1, sequence length: 26937 Attention mask shape: torch.Size([1, 1, 26937, 26937]) Position ids shape: torch.Size([1, 26937]) Input IDs shape: torch.Size([1, 26937]) Labels shape: torch.Size([1, 26937]) Final batch size: 1, sequence length: 30862 Attention mask shape: torch.Size([1, 1, 30862, 30862]) Position ids shape: torch.Size([1, 30862]) Input IDs shape: torch.Size([1, 30862]) Labels shape: torch.Size([1, 30862]) Final batch size: 1, sequence length: 29592 Attention mask shape: torch.Size([1, 1, 29592, 29592]) Position ids shape: torch.Size([1, 29592]) Input IDs shape: torch.Size([1, 29592]) Labels shape: torch.Size([1, 29592]) Final batch size: 1, sequence length: 18408 Attention mask shape: torch.Size([1, 1, 18408, 18408]) Position ids shape: torch.Size([1, 18408]) Input IDs shape: torch.Size([1, 18408]) Labels shape: torch.Size([1, 18408]) Final batch size: 1, sequence length: 32112 Attention mask shape: torch.Size([1, 1, 32112, 32112]) Position ids shape: torch.Size([1, 32112]) Input IDs shape: torch.Size([1, 32112]) Labels shape: torch.Size([1, 32112]) Final batch size: 1, sequence length: 31860 Attention mask shape: torch.Size([1, 1, 31860, 31860]) Position ids shape: torch.Size([1, 31860]) Input IDs shape: torch.Size([1, 31860]) Labels shape: torch.Size([1, 31860]) Final batch size: 1, sequence length: 32392 Attention mask shape: torch.Size([1, 1, 32392, 32392]) Position ids shape: torch.Size([1, 32392]) Input IDs shape: torch.Size([1, 32392]) Labels shape: torch.Size([1, 32392]) Final batch size: 1, sequence length: 28421 Attention mask shape: torch.Size([1, 1, 28421, 28421]) Position ids shape: torch.Size([1, 28421]) Input IDs shape: torch.Size([1, 28421]) Labels shape: torch.Size([1, 28421]) Final batch size: 1, sequence length: 33736 Attention mask shape: torch.Size([1, 1, 33736, 33736]) Position ids shape: torch.Size([1, 33736]) Input IDs shape: torch.Size([1, 33736]) Labels shape: torch.Size([1, 33736]) Final batch size: 1, sequence length: 31377 Attention mask shape: torch.Size([1, 1, 31377, 31377]) Position ids shape: torch.Size([1, 31377]) Input IDs shape: torch.Size([1, 31377]) Labels shape: torch.Size([1, 31377]) Final batch size: 1, sequence length: 13257 Attention mask shape: torch.Size([1, 1, 13257, 13257]) Position ids shape: torch.Size([1, 13257]) Input IDs shape: torch.Size([1, 13257]) Labels shape: torch.Size([1, 13257]) Final batch size: 1, sequence length: 9974 Attention mask shape: torch.Size([1, 1, 9974, 9974]) Position ids shape: torch.Size([1, 9974]) Input IDs shape: torch.Size([1, 9974]) Labels shape: torch.Size([1, 9974]) Final batch size: 1, sequence length: 19888 Attention mask shape: torch.Size([1, 1, 19888, 19888]) Position ids shape: torch.Size([1, 19888]) Input IDs shape: torch.Size([1, 19888]) Labels shape: torch.Size([1, 19888]) Final batch size: 1, sequence length: 7364 Attention mask shape: torch.Size([1, 1, 7364, 7364]) Position ids shape: torch.Size([1, 7364]) Input IDs shape: torch.Size([1, 7364]) Labels shape: torch.Size([1, 7364]) Final batch size: 1, sequence length: 32181 Attention mask shape: torch.Size([1, 1, 32181, 32181]) Position ids shape: torch.Size([1, 32181]) Input IDs shape: torch.Size([1, 32181]) Labels shape: torch.Size([1, 32181]) Final batch size: 1, sequence length: 31740 Attention mask shape: torch.Size([1, 1, 31740, 31740]) Position ids shape: torch.Size([1, 31740]) Input IDs shape: torch.Size([1, 31740]) Labels shape: torch.Size([1, 31740]) Final batch size: 1, sequence length: 32162 Attention mask shape: torch.Size([1, 1, 32162, 32162]) Position ids shape: torch.Size([1, 32162]) Input IDs shape: torch.Size([1, 32162]) Labels shape: torch.Size([1, 32162]) Final batch size: 1, sequence length: 29489 Attention mask shape: torch.Size([1, 1, 29489, 29489]) Position ids shape: torch.Size([1, 29489]) Input IDs shape: torch.Size([1, 29489]) Labels shape: torch.Size([1, 29489]) Final batch size: 1, sequence length: 25484 Attention mask shape: torch.Size([1, 1, 25484, 25484]) Position ids shape: torch.Size([1, 25484]) Input IDs shape: torch.Size([1, 25484]) Labels shape: torch.Size([1, 25484]) Final batch size: 1, sequence length: 33097 Attention mask shape: torch.Size([1, 1, 33097, 33097]) Position ids shape: torch.Size([1, 33097]) Input IDs shape: torch.Size([1, 33097]) Labels shape: torch.Size([1, 33097]) Final batch size: 1, sequence length: 35596 Attention mask shape: torch.Size([1, 1, 35596, 35596]) Position ids shape: torch.Size([1, 35596]) Input IDs shape: torch.Size([1, 35596]) Labels shape: torch.Size([1, 35596]) Final batch size: 1, sequence length: 37623 Attention mask shape: torch.Size([1, 1, 37623, 37623]) Position ids shape: torch.Size([1, 37623]) Input IDs shape: torch.Size([1, 37623]) Labels shape: torch.Size([1, 37623]) Final batch size: 1, sequence length: 35069 Attention mask shape: torch.Size([1, 1, 35069, 35069]) Position ids shape: torch.Size([1, 35069]) Input IDs shape: torch.Size([1, 35069]) Labels shape: torch.Size([1, 35069]) Final batch size: 1, sequence length: 18711 Attention mask shape: torch.Size([1, 1, 18711, 18711]) Position ids shape: torch.Size([1, 18711]) Input IDs shape: torch.Size([1, 18711]) Labels shape: torch.Size([1, 18711]) Final batch size: 1, sequence length: 30051 Attention mask shape: torch.Size([1, 1, 30051, 30051]) Position ids shape: torch.Size([1, 30051]) Input IDs shape: torch.Size([1, 30051]) Labels shape: torch.Size([1, 30051]) Final batch size: 1, sequence length: 35171 Attention mask shape: torch.Size([1, 1, 35171, 35171]) Position ids shape: torch.Size([1, 35171]) Input IDs shape: torch.Size([1, 35171]) Labels shape: torch.Size([1, 35171]) Final batch size: 1, sequence length: 34514 Attention mask shape: torch.Size([1, 1, 34514, 34514]) Position ids shape: torch.Size([1, 34514]) Input IDs shape: torch.Size([1, 34514]) Labels shape: torch.Size([1, 34514]) Final batch size: 1, sequence length: 30302 Attention mask shape: torch.Size([1, 1, 30302, 30302]) Position ids shape: torch.Size([1, 30302]) Input IDs shape: torch.Size([1, 30302]) Labels shape: torch.Size([1, 30302]) Final batch size: 1, sequence length: 17777 Attention mask shape: torch.Size([1, 1, 17777, 17777]) Position ids shape: torch.Size([1, 17777]) Final batch size: 1, sequence length: 21500 Input IDs shape: torch.Size([1, 17777]) Labels shape: torch.Size([1, 17777]) Attention mask shape: torch.Size([1, 1, 21500, 21500]) Position ids shape: torch.Size([1, 21500]) Input IDs shape: torch.Size([1, 21500]) Labels shape: torch.Size([1, 21500]) Final batch size: 1, sequence length: 24061 Attention mask shape: torch.Size([1, 1, 24061, 24061]) Position ids shape: torch.Size([1, 24061]) Input IDs shape: torch.Size([1, 24061]) Labels shape: torch.Size([1, 24061]) Final batch size: 1, sequence length: 30639 Attention mask shape: torch.Size([1, 1, 30639, 30639]) Position ids shape: torch.Size([1, 30639]) Input IDs shape: torch.Size([1, 30639]) Labels shape: torch.Size([1, 30639]) Final batch size: 1, sequence length: 39754 Attention mask shape: torch.Size([1, 1, 39754, 39754]) Position ids shape: torch.Size([1, 39754]) Input IDs shape: torch.Size([1, 39754]) Labels shape: torch.Size([1, 39754]) Final batch size: 1, sequence length: 32979 Attention mask shape: torch.Size([1, 1, 32979, 32979]) Position ids shape: torch.Size([1, 32979]) Input IDs shape: torch.Size([1, 32979]) Labels shape: torch.Size([1, 32979]) Final batch size: 1, sequence length: 25386 Attention mask shape: torch.Size([1, 1, 25386, 25386]) Position ids shape: torch.Size([1, 25386]) Input IDs shape: torch.Size([1, 25386]) Labels shape: torch.Size([1, 25386]) Final batch size: 1, sequence length: 31679 Attention mask shape: torch.Size([1, 1, 31679, 31679]) Position ids shape: torch.Size([1, 31679]) Input IDs shape: torch.Size([1, 31679]) Labels shape: torch.Size([1, 31679]) Final batch size: 1, sequence length: 13557 Attention mask shape: torch.Size([1, 1, 13557, 13557]) Position ids shape: torch.Size([1, 13557]) Input IDs shape: torch.Size([1, 13557]) Labels shape: torch.Size([1, 13557]) Final batch size: 1, sequence length: 38351 Attention mask shape: torch.Size([1, 1, 38351, 38351]) Position ids shape: torch.Size([1, 38351]) Input IDs shape: torch.Size([1, 38351]) Labels shape: torch.Size([1, 38351]) Final batch size: 1, sequence length: 38790 Attention mask shape: torch.Size([1, 1, 38790, 38790]) Position ids shape: torch.Size([1, 38790]) Input IDs shape: torch.Size([1, 38790]) Labels shape: torch.Size([1, 38790]) Final batch size: 1, sequence length: 21408 Attention mask shape: torch.Size([1, 1, 21408, 21408]) Position ids shape: torch.Size([1, 21408]) Input IDs shape: torch.Size([1, 21408]) Labels shape: torch.Size([1, 21408]) Final batch size: 1, sequence length: 34170 Attention mask shape: torch.Size([1, 1, 34170, 34170]) Position ids shape: torch.Size([1, 34170]) Input IDs shape: torch.Size([1, 34170]) Labels shape: torch.Size([1, 34170]) Final batch size: 1, sequence length: 24943 Attention mask shape: torch.Size([1, 1, 24943, 24943]) Position ids shape: torch.Size([1, 24943]) Input IDs shape: torch.Size([1, 24943]) Labels shape: torch.Size([1, 24943]) Final batch size: 1, sequence length: 36535 Attention mask shape: torch.Size([1, 1, 36535, 36535]) Position ids shape: torch.Size([1, 36535]) Input IDs shape: torch.Size([1, 36535]) Labels shape: torch.Size([1, 36535]) Final batch size: 1, sequence length: 34871 Attention mask shape: torch.Size([1, 1, 34871, 34871]) Position ids shape: torch.Size([1, 34871]) Input IDs shape: torch.Size([1, 34871]) Labels shape: torch.Size([1, 34871]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 15697 Attention mask shape: torch.Size([1, 1, 15697, 15697]) Position ids shape: torch.Size([1, 15697]) Input IDs shape: torch.Size([1, 15697]) Labels shape: torch.Size([1, 15697]) Final batch size: 1, sequence length: 21825 Attention mask shape: torch.Size([1, 1, 21825, 21825]) Position ids shape: torch.Size([1, 21825]) Input IDs shape: torch.Size([1, 21825]) Labels shape: torch.Size([1, 21825]) Final batch size: 1, sequence length: 33807 Attention mask shape: torch.Size([1, 1, 33807, 33807]) Position ids shape: torch.Size([1, 33807]) Input IDs shape: torch.Size([1, 33807]) Labels shape: torch.Size([1, 33807]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 23243 Attention mask shape: torch.Size([1, 1, 23243, 23243]) Position ids shape: torch.Size([1, 23243]) Input IDs shape: torch.Size([1, 23243]) Labels shape: torch.Size([1, 23243]) Final batch size: 1, sequence length: 18106 Attention mask shape: torch.Size([1, 1, 18106, 18106]) Position ids shape: torch.Size([1, 18106]) Input IDs shape: torch.Size([1, 18106]) Labels shape: torch.Size([1, 18106]) Final batch size: 1, sequence length: 24910 Attention mask shape: torch.Size([1, 1, 24910, 24910]) Position ids shape: torch.Size([1, 24910]) Input IDs shape: torch.Size([1, 24910]) Labels shape: torch.Size([1, 24910]) Final batch size: 1, sequence length: 24840 Attention mask shape: torch.Size([1, 1, 24840, 24840]) Position ids shape: torch.Size([1, 24840]) Input IDs shape: torch.Size([1, 24840]) Labels shape: torch.Size([1, 24840]) Final batch size: 1, sequence length: 32327 Attention mask shape: torch.Size([1, 1, 32327, 32327]) Position ids shape: torch.Size([1, 32327]) Input IDs shape: torch.Size([1, 32327]) Labels shape: torch.Size([1, 32327]) Final batch size: 1, sequence length: 33883 Attention mask shape: torch.Size([1, 1, 33883, 33883]) Position ids shape: torch.Size([1, 33883]) Input IDs shape: torch.Size([1, 33883]) Labels shape: torch.Size([1, 33883]) Final batch size: 1, sequence length: 30361 Attention mask shape: torch.Size([1, 1, 30361, 30361]) Position ids shape: torch.Size([1, 30361]) Input IDs shape: torch.Size([1, 30361]) Labels shape: torch.Size([1, 30361]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 31513 Attention mask shape: torch.Size([1, 1, 31513, 31513]) Position ids shape: torch.Size([1, 31513]) Input IDs shape: torch.Size([1, 31513]) Labels shape: torch.Size([1, 31513]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 39483 Attention mask shape: torch.Size([1, 1, 39483, 39483]) Position ids shape: torch.Size([1, 39483]) Input IDs shape: torch.Size([1, 39483]) Labels shape: torch.Size([1, 39483]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 29691 Attention mask shape: torch.Size([1, 1, 29691, 29691]) Position ids shape: torch.Size([1, 29691]) Input IDs shape: torch.Size([1, 29691]) Labels shape: torch.Size([1, 29691]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 32351 Attention mask shape: torch.Size([1, 1, 32351, 32351]) Position ids shape: torch.Size([1, 32351]) Input IDs shape: torch.Size([1, 32351]) Labels shape: torch.Size([1, 32351]) Final batch size: 1, sequence length: 27228 Attention mask shape: torch.Size([1, 1, 27228, 27228]) Position ids shape: torch.Size([1, 27228]) Input IDs shape: torch.Size([1, 27228]) Labels shape: torch.Size([1, 27228]) Final batch size: 1, sequence length: 33549 Attention mask shape: torch.Size([1, 1, 33549, 33549]) Position ids shape: torch.Size([1, 33549]) Input IDs shape: torch.Size([1, 33549]) Labels shape: torch.Size([1, 33549]) Final batch size: 1, sequence length: 15180 Attention mask shape: torch.Size([1, 1, 15180, 15180]) Position ids shape: torch.Size([1, 15180]) Input IDs shape: torch.Size([1, 15180]) Labels shape: torch.Size([1, 15180]) Final batch size: 1, sequence length: 13064 Attention mask shape: torch.Size([1, 1, 13064, 13064]) Position ids shape: torch.Size([1, 13064]) Input IDs shape: torch.Size([1, 13064]) Labels shape: torch.Size([1, 13064]) Final batch size: 1, sequence length: 30419 Attention mask shape: torch.Size([1, 1, 30419, 30419]) Position ids shape: torch.Size([1, 30419]) Input IDs shape: torch.Size([1, 30419]) Labels shape: torch.Size([1, 30419]) Final batch size: 1, sequence length: 36532 Attention mask shape: torch.Size([1, 1, 36532, 36532]) Position ids shape: torch.Size([1, 36532]) Input IDs shape: torch.Size([1, 36532]) Labels shape: torch.Size([1, 36532]) {'loss': 0.2624, 'grad_norm': 0.31747482374393754, 'learning_rate': 4.217827674798845e-06, 'num_tokens': -inf, 'epoch': 4.75} Final batch size: 1, sequence length: 8845 Attention mask shape: torch.Size([1, 1, 8845, 8845]) Position ids shape: torch.Size([1, 8845]) Input IDs shape: torch.Size([1, 8845]) Labels shape: torch.Size([1, 8845]) Final batch size: 1, sequence length: 7235 Attention mask shape: torch.Size([1, 1, 7235, 7235]) Position ids shape: torch.Size([1, 7235]) Input IDs shape: torch.Size([1, 7235]) Labels shape: torch.Size([1, 7235]) Final batch size: 1, sequence length: 12215 Attention mask shape: torch.Size([1, 1, 12215, 12215]) Position ids shape: torch.Size([1, 12215]) Input IDs shape: torch.Size([1, 12215]) Labels shape: torch.Size([1, 12215]) Final batch size: 1, sequence length: 12830 Attention mask shape: torch.Size([1, 1, 12830, 12830]) Position ids shape: torch.Size([1, 12830]) Input IDs shape: torch.Size([1, 12830]) Labels shape: torch.Size([1, 12830]) Final batch size: 1, sequence length: 13665 Attention mask shape: torch.Size([1, 1, 13665, 13665]) Position ids shape: torch.Size([1, 13665]) Input IDs shape: torch.Size([1, 13665]) Labels shape: torch.Size([1, 13665]) Final batch size: 1, sequence length: 13575 Attention mask shape: torch.Size([1, 1, 13575, 13575]) Position ids shape: torch.Size([1, 13575]) Input IDs shape: torch.Size([1, 13575]) Labels shape: torch.Size([1, 13575]) Final batch size: 1, sequence length: 12562 Attention mask shape: torch.Size([1, 1, 12562, 12562]) Position ids shape: torch.Size([1, 12562]) Input IDs shape: torch.Size([1, 12562]) Labels shape: torch.Size([1, 12562]) Final batch size: 1, sequence length: 12960 Attention mask shape: torch.Size([1, 1, 12960, 12960]) Position ids shape: torch.Size([1, 12960]) Input IDs shape: torch.Size([1, 12960]) Labels shape: torch.Size([1, 12960]) Final batch size: 1, sequence length: 14689 Attention mask shape: torch.Size([1, 1, 14689, 14689]) Position ids shape: torch.Size([1, 14689]) Input IDs shape: torch.Size([1, 14689]) Labels shape: torch.Size([1, 14689]) Final batch size: 1, sequence length: 17026 Attention mask shape: torch.Size([1, 1, 17026, 17026]) Position ids shape: torch.Size([1, 17026]) Input IDs shape: torch.Size([1, 17026]) Labels shape: torch.Size([1, 17026]) Final batch size: 1, sequence length: 14833 Attention mask shape: torch.Size([1, 1, 14833, 14833]) Position ids shape: torch.Size([1, 14833]) Input IDs shape: torch.Size([1, 14833]) Labels shape: torch.Size([1, 14833]) Final batch size: 1, sequence length: 12622 Attention mask shape: torch.Size([1, 1, 12622, 12622]) Position ids shape: torch.Size([1, 12622]) Input IDs shape: torch.Size([1, 12622]) Labels shape: torch.Size([1, 12622]) Final batch size: 1, sequence length: 14482 Attention mask shape: torch.Size([1, 1, 14482, 14482]) Position ids shape: torch.Size([1, 14482]) Input IDs shape: torch.Size([1, 14482]) Labels shape: torch.Size([1, 14482]) Final batch size: 1, sequence length: 17988 Attention mask shape: torch.Size([1, 1, 17988, 17988]) Position ids shape: torch.Size([1, 17988]) Input IDs shape: torch.Size([1, 17988]) Labels shape: torch.Size([1, 17988]) Final batch size: 1, sequence length: 16145 Attention mask shape: torch.Size([1, 1, 16145, 16145]) Position ids shape: torch.Size([1, 16145]) Input IDs shape: torch.Size([1, 16145]) Labels shape: torch.Size([1, 16145]) Final batch size: 1, sequence length: 17016 Attention mask shape: torch.Size([1, 1, 17016, 17016]) Position ids shape: torch.Size([1, 17016]) Input IDs shape: torch.Size([1, 17016]) Labels shape: torch.Size([1, 17016]) Final batch size: 1, sequence length: 18922 Attention mask shape: torch.Size([1, 1, 18922, 18922]) Position ids shape: torch.Size([1, 18922]) Input IDs shape: torch.Size([1, 18922]) Labels shape: torch.Size([1, 18922]) Final batch size: 1, sequence length: 17512 Attention mask shape: torch.Size([1, 1, 17512, 17512]) Position ids shape: torch.Size([1, 17512]) Input IDs shape: torch.Size([1, 17512]) Labels shape: torch.Size([1, 17512]) Final batch size: 1, sequence length: 19409 Attention mask shape: torch.Size([1, 1, 19409, 19409]) Position ids shape: torch.Size([1, 19409]) Input IDs shape: torch.Size([1, 19409]) Labels shape: torch.Size([1, 19409]) Final batch size: 1, sequence length: 15673 Attention mask shape: torch.Size([1, 1, 15673, 15673]) Position ids shape: torch.Size([1, 15673]) Input IDs shape: torch.Size([1, 15673]) Labels shape: torch.Size([1, 15673]) Final batch size: 1, sequence length: 19225 Attention mask shape: torch.Size([1, 1, 19225, 19225]) Position ids shape: torch.Size([1, 19225]) Input IDs shape: torch.Size([1, 19225]) Labels shape: torch.Size([1, 19225]) Final batch size: 1, sequence length: 15816 Attention mask shape: torch.Size([1, 1, 15816, 15816]) Position ids shape: torch.Size([1, 15816]) Input IDs shape: torch.Size([1, 15816]) Labels shape: torch.Size([1, 15816]) Final batch size: 1, sequence length: 18978 Attention mask shape: torch.Size([1, 1, 18978, 18978]) Position ids shape: torch.Size([1, 18978]) Input IDs shape: torch.Size([1, 18978]) Labels shape: torch.Size([1, 18978]) Final batch size: 1, sequence length: 21075 Attention mask shape: torch.Size([1, 1, 21075, 21075]) Position ids shape: torch.Size([1, 21075]) Input IDs shape: torch.Size([1, 21075]) Labels shape: torch.Size([1, 21075]) Final batch size: 1, sequence length: 20630 Attention mask shape: torch.Size([1, 1, 20630, 20630]) Position ids shape: torch.Size([1, 20630]) Input IDs shape: torch.Size([1, 20630]) Labels shape: torch.Size([1, 20630]) Final batch size: 1, sequence length: 20389 Attention mask shape: torch.Size([1, 1, 20389, 20389]) Position ids shape: torch.Size([1, 20389]) Input IDs shape: torch.Size([1, 20389]) Labels shape: torch.Size([1, 20389]) Final batch size: 1, sequence length: 14577 Attention mask shape: torch.Size([1, 1, 14577, 14577]) Position ids shape: torch.Size([1, 14577]) Input IDs shape: torch.Size([1, 14577]) Labels shape: torch.Size([1, 14577]) Final batch size: 1, sequence length: 15222 Attention mask shape: torch.Size([1, 1, 15222, 15222]) Position ids shape: torch.Size([1, 15222]) Input IDs shape: torch.Size([1, 15222]) Labels shape: torch.Size([1, 15222]) Final batch size: 1, sequence length: 10494 Attention mask shape: torch.Size([1, 1, 10494, 10494]) Position ids shape: torch.Size([1, 10494]) Input IDs shape: torch.Size([1, 10494]) Labels shape: torch.Size([1, 10494]) Final batch size: 1, sequence length: 19325 Attention mask shape: torch.Size([1, 1, 19325, 19325]) Position ids shape: torch.Size([1, 19325]) Input IDs shape: torch.Size([1, 19325]) Labels shape: torch.Size([1, 19325]) Final batch size: 1, sequence length: 21598 Attention mask shape: torch.Size([1, 1, 21598, 21598]) Position ids shape: torch.Size([1, 21598]) Input IDs shape: torch.Size([1, 21598]) Labels shape: torch.Size([1, 21598]) Final batch size: 1, sequence length: 20784 Attention mask shape: torch.Size([1, 1, 20784, 20784]) Position ids shape: torch.Size([1, 20784]) Input IDs shape: torch.Size([1, 20784]) Labels shape: torch.Size([1, 20784]) Final batch size: 1, sequence length: 22949 Attention mask shape: torch.Size([1, 1, 22949, 22949]) Position ids shape: torch.Size([1, 22949]) Input IDs shape: torch.Size([1, 22949]) Labels shape: torch.Size([1, 22949]) Final batch size: 1, sequence length: 19724 Attention mask shape: torch.Size([1, 1, 19724, 19724]) Position ids shape: torch.Size([1, 19724]) Input IDs shape: torch.Size([1, 19724]) Labels shape: torch.Size([1, 19724]) Final batch size: 1, sequence length: 6618 Attention mask shape: torch.Size([1, 1, 6618, 6618]) Position ids shape: torch.Size([1, 6618]) Input IDs shape: torch.Size([1, 6618]) Labels shape: torch.Size([1, 6618]) Final batch size: 1, sequence length: 20465 Attention mask shape: torch.Size([1, 1, 20465, 20465]) Position ids shape: torch.Size([1, 20465]) Input IDs shape: torch.Size([1, 20465]) Labels shape: torch.Size([1, 20465]) Final batch size: 1, sequence length: 20991 Attention mask shape: torch.Size([1, 1, 20991, 20991]) Position ids shape: torch.Size([1, 20991]) Input IDs shape: torch.Size([1, 20991]) Labels shape: torch.Size([1, 20991]) Final batch size: 1, sequence length: 20585 Attention mask shape: torch.Size([1, 1, 20585, 20585]) Position ids shape: torch.Size([1, 20585]) Input IDs shape: torch.Size([1, 20585]) Labels shape: torch.Size([1, 20585]) Final batch size: 1, sequence length: 12969 Attention mask shape: torch.Size([1, 1, 12969, 12969]) Position ids shape: torch.Size([1, 12969]) Input IDs shape: torch.Size([1, 12969]) Labels shape: torch.Size([1, 12969]) Final batch size: 1, sequence length: 23805 Attention mask shape: torch.Size([1, 1, 23805, 23805]) Position ids shape: torch.Size([1, 23805]) Input IDs shape: torch.Size([1, 23805]) Labels shape: torch.Size([1, 23805]) Final batch size: 1, sequence length: 13072 Attention mask shape: torch.Size([1, 1, 13072, 13072]) Position ids shape: torch.Size([1, 13072]) Input IDs shape: torch.Size([1, 13072]) Labels shape: torch.Size([1, 13072]) Final batch size: 1, sequence length: 3010 Attention mask shape: torch.Size([1, 1, 3010, 3010]) Position ids shape: torch.Size([1, 3010]) Input IDs shape: torch.Size([1, 3010]) Labels shape: torch.Size([1, 3010]) Final batch size: 1, sequence length: 23293 Attention mask shape: torch.Size([1, 1, 23293, 23293]) Position ids shape: torch.Size([1, 23293]) Input IDs shape: torch.Size([1, 23293]) Labels shape: torch.Size([1, 23293]) Final batch size: 1, sequence length: 23755 Attention mask shape: torch.Size([1, 1, 23755, 23755]) Position ids shape: torch.Size([1, 23755]) Input IDs shape: torch.Size([1, 23755]) Labels shape: torch.Size([1, 23755]) Final batch size: 1, sequence length: 22778 Attention mask shape: torch.Size([1, 1, 22778, 22778]) Position ids shape: torch.Size([1, 22778]) Input IDs shape: torch.Size([1, 22778]) Labels shape: torch.Size([1, 22778]) Final batch size: 1, sequence length: 21907 Attention mask shape: torch.Size([1, 1, 21907, 21907]) Position ids shape: torch.Size([1, 21907]) Input IDs shape: torch.Size([1, 21907]) Labels shape: torch.Size([1, 21907]) Final batch size: 1, sequence length: 26424 Attention mask shape: torch.Size([1, 1, 26424, 26424]) Position ids shape: torch.Size([1, 26424]) Input IDs shape: torch.Size([1, 26424]) Labels shape: torch.Size([1, 26424]) Final batch size: 1, sequence length: 25934 Attention mask shape: torch.Size([1, 1, 25934, 25934]) Position ids shape: torch.Size([1, 25934]) Input IDs shape: torch.Size([1, 25934]) Labels shape: torch.Size([1, 25934]) Final batch size: 1, sequence length: 14548 Attention mask shape: torch.Size([1, 1, 14548, 14548]) Position ids shape: torch.Size([1, 14548]) Input IDs shape: torch.Size([1, 14548]) Labels shape: torch.Size([1, 14548]) Final batch size: 1, sequence length: 26500 Attention mask shape: torch.Size([1, 1, 26500, 26500]) Position ids shape: torch.Size([1, 26500]) Input IDs shape: torch.Size([1, 26500]) Labels shape: torch.Size([1, 26500]) Final batch size: 1, sequence length: 23437 Attention mask shape: torch.Size([1, 1, 23437, 23437]) Position ids shape: torch.Size([1, 23437]) Input IDs shape: torch.Size([1, 23437]) Labels shape: torch.Size([1, 23437]) Final batch size: 1, sequence length: 22851 Attention mask shape: torch.Size([1, 1, 22851, 22851]) Position ids shape: torch.Size([1, 22851]) Input IDs shape: torch.Size([1, 22851]) Labels shape: torch.Size([1, 22851]) Final batch size: 1, sequence length: 28474 Attention mask shape: torch.Size([1, 1, 28474, 28474]) Position ids shape: torch.Size([1, 28474]) Input IDs shape: torch.Size([1, 28474]) Labels shape: torch.Size([1, 28474]) Final batch size: 1, sequence length: 28012 Attention mask shape: torch.Size([1, 1, 28012, 28012]) Position ids shape: torch.Size([1, 28012]) Input IDs shape: torch.Size([1, 28012]) Labels shape: torch.Size([1, 28012]) Final batch size: 1, sequence length: 27384 Attention mask shape: torch.Size([1, 1, 27384, 27384]) Position ids shape: torch.Size([1, 27384]) Input IDs shape: torch.Size([1, 27384]) Labels shape: torch.Size([1, 27384]) Final batch size: 1, sequence length: 21853 Attention mask shape: torch.Size([1, 1, 21853, 21853]) Position ids shape: torch.Size([1, 21853]) Input IDs shape: torch.Size([1, 21853]) Labels shape: torch.Size([1, 21853]) Final batch size: 1, sequence length: 24107 Attention mask shape: torch.Size([1, 1, 24107, 24107]) Position ids shape: torch.Size([1, 24107]) Input IDs shape: torch.Size([1, 24107]) Labels shape: torch.Size([1, 24107]) Final batch size: 1, sequence length: 21322 Attention mask shape: torch.Size([1, 1, 21322, 21322]) Position ids shape: torch.Size([1, 21322]) Input IDs shape: torch.Size([1, 21322]) Labels shape: torch.Size([1, 21322]) Final batch size: 1, sequence length: 23442 Attention mask shape: torch.Size([1, 1, 23442, 23442]) Position ids shape: torch.Size([1, 23442]) Input IDs shape: torch.Size([1, 23442]) Labels shape: torch.Size([1, 23442]) Final batch size: 1, sequence length: 26454 Attention mask shape: torch.Size([1, 1, 26454, 26454]) Position ids shape: torch.Size([1, 26454]) Input IDs shape: torch.Size([1, 26454]) Labels shape: torch.Size([1, 26454]) Final batch size: 1, sequence length: 11662 Attention mask shape: torch.Size([1, 1, 11662, 11662]) Position ids shape: torch.Size([1, 11662]) Input IDs shape: torch.Size([1, 11662]) Labels shape: torch.Size([1, 11662]) Final batch size: 1, sequence length: 20028 Attention mask shape: torch.Size([1, 1, 20028, 20028]) Position ids shape: torch.Size([1, 20028]) Input IDs shape: torch.Size([1, 20028]) Labels shape: torch.Size([1, 20028]) Final batch size: 1, sequence length: 30235 Attention mask shape: torch.Size([1, 1, 30235, 30235]) Position ids shape: torch.Size([1, 30235]) Input IDs shape: torch.Size([1, 30235]) Labels shape: torch.Size([1, 30235]) Final batch size: 1, sequence length: 19027 Attention mask shape: torch.Size([1, 1, 19027, 19027]) Position ids shape: torch.Size([1, 19027]) Input IDs shape: torch.Size([1, 19027]) Labels shape: torch.Size([1, 19027]) Final batch size: 1, sequence length: 14137 Attention mask shape: torch.Size([1, 1, 14137, 14137]) Position ids shape: torch.Size([1, 14137]) Input IDs shape: torch.Size([1, 14137]) Labels shape: torch.Size([1, 14137]) Final batch size: 1, sequence length: 24716 Attention mask shape: torch.Size([1, 1, 24716, 24716]) Position ids shape: torch.Size([1, 24716]) Input IDs shape: torch.Size([1, 24716]) Labels shape: torch.Size([1, 24716]) Final batch size: 1, sequence length: 27520 Attention mask shape: torch.Size([1, 1, 27520, 27520]) Position ids shape: torch.Size([1, 27520]) Input IDs shape: torch.Size([1, 27520]) Labels shape: torch.Size([1, 27520]) Final batch size: 1, sequence length: 24364 Attention mask shape: torch.Size([1, 1, 24364, 24364]) Position ids shape: torch.Size([1, 24364]) Input IDs shape: torch.Size([1, 24364]) Labels shape: torch.Size([1, 24364]) Final batch size: 1, sequence length: 20314 Attention mask shape: torch.Size([1, 1, 20314, 20314]) Position ids shape: torch.Size([1, 20314]) Input IDs shape: torch.Size([1, 20314]) Labels shape: torch.Size([1, 20314]) Final batch size: 1, sequence length: 24111 Attention mask shape: torch.Size([1, 1, 24111, 24111]) Position ids shape: torch.Size([1, 24111]) Input IDs shape: torch.Size([1, 24111]) Labels shape: torch.Size([1, 24111]) Final batch size: 1, sequence length: 22920 Attention mask shape: torch.Size([1, 1, 22920, 22920]) Position ids shape: torch.Size([1, 22920]) Input IDs shape: torch.Size([1, 22920]) Labels shape: torch.Size([1, 22920]) Final batch size: 1, sequence length: 12234 Attention mask shape: torch.Size([1, 1, 12234, 12234]) Position ids shape: torch.Size([1, 12234]) Input IDs shape: torch.Size([1, 12234]) Labels shape: torch.Size([1, 12234]) Final batch size: 1, sequence length: 30041 Attention mask shape: torch.Size([1, 1, 30041, 30041]) Position ids shape: torch.Size([1, 30041]) Input IDs shape: torch.Size([1, 30041]) Labels shape: torch.Size([1, 30041]) Final batch size: 1, sequence length: 24923 Attention mask shape: torch.Size([1, 1, 24923, 24923]) Position ids shape: torch.Size([1, 24923]) Input IDs shape: torch.Size([1, 24923]) Labels shape: torch.Size([1, 24923]) Final batch size: 1, sequence length: 12519 Attention mask shape: torch.Size([1, 1, 12519, 12519]) Position ids shape: torch.Size([1, 12519]) Input IDs shape: torch.Size([1, 12519]) Labels shape: torch.Size([1, 12519]) Final batch size: 1, sequence length: 24830 Attention mask shape: torch.Size([1, 1, 24830, 24830]) Position ids shape: torch.Size([1, 24830]) Input IDs shape: torch.Size([1, 24830]) Labels shape: torch.Size([1, 24830]) Final batch size: 1, sequence length: 23695 Attention mask shape: torch.Size([1, 1, 23695, 23695]) Position ids shape: torch.Size([1, 23695]) Input IDs shape: torch.Size([1, 23695]) Labels shape: torch.Size([1, 23695]) Final batch size: 1, sequence length: 33765 Attention mask shape: torch.Size([1, 1, 33765, 33765]) Position ids shape: torch.Size([1, 33765]) Input IDs shape: torch.Size([1, 33765]) Labels shape: torch.Size([1, 33765]) Final batch size: 1, sequence length: 27107 Attention mask shape: torch.Size([1, 1, 27107, 27107]) Position ids shape: torch.Size([1, 27107]) Input IDs shape: torch.Size([1, 27107]) Labels shape: torch.Size([1, 27107]) Final batch size: 1, sequence length: 33179 Attention mask shape: torch.Size([1, 1, 33179, 33179]) Position ids shape: torch.Size([1, 33179]) Input IDs shape: torch.Size([1, 33179]) Labels shape: torch.Size([1, 33179]) Final batch size: 1, sequence length: 21168 Attention mask shape: torch.Size([1, 1, 21168, 21168]) Position ids shape: torch.Size([1, 21168]) Input IDs shape: torch.Size([1, 21168]) Labels shape: torch.Size([1, 21168]) Final batch size: 1, sequence length: 23855 Attention mask shape: torch.Size([1, 1, 23855, 23855]) Position ids shape: torch.Size([1, 23855]) Input IDs shape: torch.Size([1, 23855]) Labels shape: torch.Size([1, 23855]) Final batch size: 1, sequence length: 34169 Attention mask shape: torch.Size([1, 1, 34169, 34169]) Position ids shape: torch.Size([1, 34169]) Input IDs shape: torch.Size([1, 34169]) Labels shape: torch.Size([1, 34169]) Final batch size: 1, sequence length: 29743 Attention mask shape: torch.Size([1, 1, 29743, 29743]) Position ids shape: torch.Size([1, 29743]) Input IDs shape: torch.Size([1, 29743]) Labels shape: torch.Size([1, 29743]) Final batch size: 1, sequence length: 17763 Attention mask shape: torch.Size([1, 1, 17763, 17763]) Position ids shape: torch.Size([1, 17763]) Input IDs shape: torch.Size([1, 17763]) Labels shape: torch.Size([1, 17763]) Final batch size: 1, sequence length: 29385 Attention mask shape: torch.Size([1, 1, 29385, 29385]) Position ids shape: torch.Size([1, 29385]) Input IDs shape: torch.Size([1, 29385]) Labels shape: torch.Size([1, 29385]) Final batch size: 1, sequence length: 31404 Attention mask shape: torch.Size([1, 1, 31404, 31404]) Position ids shape: torch.Size([1, 31404]) Input IDs shape: torch.Size([1, 31404]) Labels shape: torch.Size([1, 31404]) Final batch size: 1, sequence length: 33321 Attention mask shape: torch.Size([1, 1, 33321, 33321]) Position ids shape: torch.Size([1, 33321]) Input IDs shape: torch.Size([1, 33321]) Labels shape: torch.Size([1, 33321]) Final batch size: 1, sequence length: 19302 Attention mask shape: torch.Size([1, 1, 19302, 19302]) Position ids shape: torch.Size([1, 19302]) Input IDs shape: torch.Size([1, 19302]) Labels shape: torch.Size([1, 19302]) Final batch size: 1, sequence length: 27738 Attention mask shape: torch.Size([1, 1, 27738, 27738]) Position ids shape: torch.Size([1, 27738]) Input IDs shape: torch.Size([1, 27738]) Labels shape: torch.Size([1, 27738]) Final batch size: 1, sequence length: 29842 Attention mask shape: torch.Size([1, 1, 29842, 29842]) Position ids shape: torch.Size([1, 29842]) Input IDs shape: torch.Size([1, 29842]) Labels shape: torch.Size([1, 29842]) Final batch size: 1, sequence length: 38576 Attention mask shape: torch.Size([1, 1, 38576, 38576]) Position ids shape: torch.Size([1, 38576]) Input IDs shape: torch.Size([1, 38576]) Labels shape: torch.Size([1, 38576]) Final batch size: 1, sequence length: 37657 Attention mask shape: torch.Size([1, 1, 37657, 37657]) Position ids shape: torch.Size([1, 37657]) Input IDs shape: torch.Size([1, 37657]) Labels shape: torch.Size([1, 37657]) Final batch size: 1, sequence length: 40006 Attention mask shape: torch.Size([1, 1, 40006, 40006]) Position ids shape: torch.Size([1, 40006]) Input IDs shape: torch.Size([1, 40006]) Labels shape: torch.Size([1, 40006]) Final batch size: 1, sequence length: 10139 Attention mask shape: torch.Size([1, 1, 10139, 10139]) Position ids shape: torch.Size([1, 10139]) Input IDs shape: torch.Size([1, 10139]) Labels shape: torch.Size([1, 10139]) Final batch size: 1, sequence length: 35555 Attention mask shape: torch.Size([1, 1, 35555, 35555]) Position ids shape: torch.Size([1, 35555]) Input IDs shape: torch.Size([1, 35555]) Labels shape: torch.Size([1, 35555]) Final batch size: 1, sequence length: 26696 Attention mask shape: torch.Size([1, 1, 26696, 26696]) Final batch size: 1, sequence length: 26685 Position ids shape: torch.Size([1, 26696]) Attention mask shape: torch.Size([1, 1, 26685, 26685]) Input IDs shape: torch.Size([1, 26696]) Position ids shape: torch.Size([1, 26685]) Labels shape: torch.Size([1, 26696]) Input IDs shape: torch.Size([1, 26685]) Labels shape: torch.Size([1, 26685]) Final batch size: 1, sequence length: 32217 Attention mask shape: torch.Size([1, 1, 32217, 32217]) Position ids shape: torch.Size([1, 32217]) Input IDs shape: torch.Size([1, 32217]) Labels shape: torch.Size([1, 32217]) Final batch size: 1, sequence length: 24187 Attention mask shape: torch.Size([1, 1, 24187, 24187]) Position ids shape: torch.Size([1, 24187]) Input IDs shape: torch.Size([1, 24187]) Labels shape: torch.Size([1, 24187]) Final batch size: 1, sequence length: 29286 Attention mask shape: torch.Size([1, 1, 29286, 29286]) Position ids shape: torch.Size([1, 29286]) Input IDs shape: torch.Size([1, 29286]) Labels shape: torch.Size([1, 29286]) Final batch size: 1, sequence length: 18734 Attention mask shape: torch.Size([1, 1, 18734, 18734]) Position ids shape: torch.Size([1, 18734]) Input IDs shape: torch.Size([1, 18734]) Labels shape: torch.Size([1, 18734]) Final batch size: 1, sequence length: 29085 Attention mask shape: torch.Size([1, 1, 29085, 29085]) Position ids shape: torch.Size([1, 29085]) Input IDs shape: torch.Size([1, 29085]) Labels shape: torch.Size([1, 29085]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 39018 Attention mask shape: torch.Size([1, 1, 39018, 39018]) Position ids shape: torch.Size([1, 39018]) Input IDs shape: torch.Size([1, 39018]) Labels shape: torch.Size([1, 39018]) Final batch size: 1, sequence length: 35949 Attention mask shape: torch.Size([1, 1, 35949, 35949]) Position ids shape: torch.Size([1, 35949]) Input IDs shape: torch.Size([1, 35949]) Labels shape: torch.Size([1, 35949]) Final batch size: 1, sequence length: 31991 Attention mask shape: torch.Size([1, 1, 31991, 31991]) Position ids shape: torch.Size([1, 31991]) Input IDs shape: torch.Size([1, 31991]) Labels shape: torch.Size([1, 31991]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 31115 Attention mask shape: torch.Size([1, 1, 31115, 31115]) Position ids shape: torch.Size([1, 31115]) Input IDs shape: torch.Size([1, 31115]) Labels shape: torch.Size([1, 31115]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 19138 Attention mask shape: torch.Size([1, 1, 19138, 19138]) Position ids shape: torch.Size([1, 19138]) Input IDs shape: torch.Size([1, 19138]) Labels shape: torch.Size([1, 19138]) Final batch size: 1, sequence length: 23848 Attention mask shape: torch.Size([1, 1, 23848, 23848]) Position ids shape: torch.Size([1, 23848]) Input IDs shape: torch.Size([1, 23848]) Labels shape: torch.Size([1, 23848]) Final batch size: 1, sequence length: 22500 Attention mask shape: torch.Size([1, 1, 22500, 22500]) Position ids shape: torch.Size([1, 22500]) Input IDs shape: torch.Size([1, 22500]) Labels shape: torch.Size([1, 22500]) Final batch size: 1, sequence length: 25990 Attention mask shape: torch.Size([1, 1, 25990, 25990]) Position ids shape: torch.Size([1, 25990]) Input IDs shape: torch.Size([1, 25990]) Labels shape: torch.Size([1, 25990]) Final batch size: 1, sequence length: 32762 Attention mask shape: torch.Size([1, 1, 32762, 32762]) Position ids shape: torch.Size([1, 32762]) Input IDs shape: torch.Size([1, 32762]) Labels shape: torch.Size([1, 32762]) Final batch size: 1, sequence length: 37680 Attention mask shape: torch.Size([1, 1, 37680, 37680]) Position ids shape: torch.Size([1, 37680]) Input IDs shape: torch.Size([1, 37680]) Labels shape: torch.Size([1, 37680]) Final batch size: 1, sequence length: 22778 Attention mask shape: torch.Size([1, 1, 22778, 22778]) Position ids shape: torch.Size([1, 22778]) Input IDs shape: torch.Size([1, 22778]) Labels shape: torch.Size([1, 22778]) Final batch size: 1, sequence length: 40834 Attention mask shape: torch.Size([1, 1, 40834, 40834]) Position ids shape: torch.Size([1, 40834]) Input IDs shape: torch.Size([1, 40834]) Labels shape: torch.Size([1, 40834]) Final batch size: 1, sequence length: 32675 Attention mask shape: torch.Size([1, 1, 32675, 32675]) Position ids shape: torch.Size([1, 32675]) Input IDs shape: torch.Size([1, 32675]) Labels shape: torch.Size([1, 32675]) Final batch size: 1, sequence length: 34628 Attention mask shape: torch.Size([1, 1, 34628, 34628]) Position ids shape: torch.Size([1, 34628]) Input IDs shape: torch.Size([1, 34628]) Labels shape: torch.Size([1, 34628]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 26372 Attention mask shape: torch.Size([1, 1, 26372, 26372]) Position ids shape: torch.Size([1, 26372]) Input IDs shape: torch.Size([1, 26372]) Labels shape: torch.Size([1, 26372]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 34791 Attention mask shape: torch.Size([1, 1, 34791, 34791]) Position ids shape: torch.Size([1, 34791]) Input IDs shape: torch.Size([1, 34791]) Labels shape: torch.Size([1, 34791]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) {'loss': 0.2569, 'grad_norm': 0.31930238061360666, 'learning_rate': 3.960441545911205e-06, 'num_tokens': -inf, 'epoch': 4.88} Final batch size: 1, sequence length: 5686 Attention mask shape: torch.Size([1, 1, 5686, 5686]) Position ids shape: torch.Size([1, 5686]) Input IDs shape: torch.Size([1, 5686]) Labels shape: torch.Size([1, 5686]) Final batch size: 1, sequence length: 6986 Attention mask shape: torch.Size([1, 1, 6986, 6986]) Position ids shape: torch.Size([1, 6986]) Input IDs shape: torch.Size([1, 6986]) Labels shape: torch.Size([1, 6986]) Final batch size: 1, sequence length: 11304 Attention mask shape: torch.Size([1, 1, 11304, 11304]) Position ids shape: torch.Size([1, 11304]) Input IDs shape: torch.Size([1, 11304]) Labels shape: torch.Size([1, 11304]) Final batch size: 1, sequence length: 12847 Attention mask shape: torch.Size([1, 1, 12847, 12847]) Position ids shape: torch.Size([1, 12847]) Input IDs shape: torch.Size([1, 12847]) Labels shape: torch.Size([1, 12847]) Final batch size: 1, sequence length: 13224 Attention mask shape: torch.Size([1, 1, 13224, 13224]) Position ids shape: torch.Size([1, 13224]) Input IDs shape: torch.Size([1, 13224]) Labels shape: torch.Size([1, 13224]) Final batch size: 1, sequence length: 14168 Attention mask shape: torch.Size([1, 1, 14168, 14168]) Position ids shape: torch.Size([1, 14168]) Input IDs shape: torch.Size([1, 14168]) Labels shape: torch.Size([1, 14168]) Final batch size: 1, sequence length: 13872 Final batch size: 1, sequence length: 14304 Attention mask shape: torch.Size([1, 1, 13872, 13872]) Position ids shape: torch.Size([1, 13872]) Input IDs shape: torch.Size([1, 13872]) Labels shape: torch.Size([1, 13872]) Attention mask shape: torch.Size([1, 1, 14304, 14304]) Position ids shape: torch.Size([1, 14304]) Input IDs shape: torch.Size([1, 14304]) Labels shape: torch.Size([1, 14304]) Final batch size: 1, sequence length: 15621 Attention mask shape: torch.Size([1, 1, 15621, 15621]) Position ids shape: torch.Size([1, 15621]) Input IDs shape: torch.Size([1, 15621]) Labels shape: torch.Size([1, 15621]) Final batch size: 1, sequence length: 18246 Attention mask shape: torch.Size([1, 1, 18246, 18246]) Position ids shape: torch.Size([1, 18246]) Input IDs shape: torch.Size([1, 18246]) Labels shape: torch.Size([1, 18246]) Final batch size: 1, sequence length: 16514 Attention mask shape: torch.Size([1, 1, 16514, 16514]) Position ids shape: torch.Size([1, 16514]) Input IDs shape: torch.Size([1, 16514]) Labels shape: torch.Size([1, 16514]) Final batch size: 1, sequence length: 17852 Attention mask shape: torch.Size([1, 1, 17852, 17852]) Position ids shape: torch.Size([1, 17852]) Input IDs shape: torch.Size([1, 17852]) Labels shape: torch.Size([1, 17852]) Final batch size: 1, sequence length: 17213 Attention mask shape: torch.Size([1, 1, 17213, 17213]) Position ids shape: torch.Size([1, 17213]) Input IDs shape: torch.Size([1, 17213]) Labels shape: torch.Size([1, 17213]) Final batch size: 1, sequence length: 17773 Attention mask shape: torch.Size([1, 1, 17773, 17773]) Position ids shape: torch.Size([1, 17773]) Input IDs shape: torch.Size([1, 17773]) Labels shape: torch.Size([1, 17773]) Final batch size: 1, sequence length: 16440 Attention mask shape: torch.Size([1, 1, 16440, 16440]) Position ids shape: torch.Size([1, 16440]) Input IDs shape: torch.Size([1, 16440]) Labels shape: torch.Size([1, 16440]) Final batch size: 1, sequence length: 15652 Attention mask shape: torch.Size([1, 1, 15652, 15652]) Position ids shape: torch.Size([1, 15652]) Input IDs shape: torch.Size([1, 15652]) Labels shape: torch.Size([1, 15652]) Final batch size: 1, sequence length: 18044 Attention mask shape: torch.Size([1, 1, 18044, 18044]) Position ids shape: torch.Size([1, 18044]) Input IDs shape: torch.Size([1, 18044]) Labels shape: torch.Size([1, 18044]) Final batch size: 1, sequence length: 16501 Attention mask shape: torch.Size([1, 1, 16501, 16501]) Position ids shape: torch.Size([1, 16501]) Input IDs shape: torch.Size([1, 16501]) Labels shape: torch.Size([1, 16501]) Final batch size: 1, sequence length: 16378 Attention mask shape: torch.Size([1, 1, 16378, 16378]) Position ids shape: torch.Size([1, 16378]) Input IDs shape: torch.Size([1, 16378]) Labels shape: torch.Size([1, 16378]) Final batch size: 1, sequence length: 18155 Attention mask shape: torch.Size([1, 1, 18155, 18155]) Position ids shape: torch.Size([1, 18155]) Input IDs shape: torch.Size([1, 18155]) Labels shape: torch.Size([1, 18155]) Final batch size: 1, sequence length: 18289 Attention mask shape: torch.Size([1, 1, 18289, 18289]) Position ids shape: torch.Size([1, 18289]) Input IDs shape: torch.Size([1, 18289]) Labels shape: torch.Size([1, 18289]) Final batch size: 1, sequence length: 21163 Attention mask shape: torch.Size([1, 1, 21163, 21163]) Position ids shape: torch.Size([1, 21163]) Input IDs shape: torch.Size([1, 21163]) Labels shape: torch.Size([1, 21163]) Final batch size: 1, sequence length: 21316 Attention mask shape: torch.Size([1, 1, 21316, 21316]) Position ids shape: torch.Size([1, 21316]) Input IDs shape: torch.Size([1, 21316]) Labels shape: torch.Size([1, 21316]) Final batch size: 1, sequence length: 22117 Attention mask shape: torch.Size([1, 1, 22117, 22117]) Position ids shape: torch.Size([1, 22117]) Input IDs shape: torch.Size([1, 22117]) Labels shape: torch.Size([1, 22117]) Final batch size: 1, sequence length: 20813 Attention mask shape: torch.Size([1, 1, 20813, 20813]) Position ids shape: torch.Size([1, 20813]) Input IDs shape: torch.Size([1, 20813]) Labels shape: torch.Size([1, 20813]) Final batch size: 1, sequence length: 21207 Attention mask shape: torch.Size([1, 1, 21207, 21207]) Position ids shape: torch.Size([1, 21207]) Input IDs shape: torch.Size([1, 21207]) Labels shape: torch.Size([1, 21207]) Final batch size: 1, sequence length: 22784 Attention mask shape: torch.Size([1, 1, 22784, 22784]) Position ids shape: torch.Size([1, 22784]) Input IDs shape: torch.Size([1, 22784]) Labels shape: torch.Size([1, 22784]) Final batch size: 1, sequence length: 20176 Attention mask shape: torch.Size([1, 1, 20176, 20176]) Position ids shape: torch.Size([1, 20176]) Input IDs shape: torch.Size([1, 20176]) Labels shape: torch.Size([1, 20176]) Final batch size: 1, sequence length: 21562 Attention mask shape: torch.Size([1, 1, 21562, 21562]) Position ids shape: torch.Size([1, 21562]) Input IDs shape: torch.Size([1, 21562]) Labels shape: torch.Size([1, 21562]) Final batch size: 1, sequence length: 22152 Attention mask shape: torch.Size([1, 1, 22152, 22152]) Position ids shape: torch.Size([1, 22152]) Input IDs shape: torch.Size([1, 22152]) Labels shape: torch.Size([1, 22152]) Final batch size: 1, sequence length: 21607 Attention mask shape: torch.Size([1, 1, 21607, 21607]) Position ids shape: torch.Size([1, 21607]) Input IDs shape: torch.Size([1, 21607]) Labels shape: torch.Size([1, 21607]) Final batch size: 1, sequence length: 20766 Attention mask shape: torch.Size([1, 1, 20766, 20766]) Position ids shape: torch.Size([1, 20766]) Input IDs shape: torch.Size([1, 20766]) Labels shape: torch.Size([1, 20766]) Final batch size: 1, sequence length: 21705 Attention mask shape: torch.Size([1, 1, 21705, 21705]) Position ids shape: torch.Size([1, 21705]) Input IDs shape: torch.Size([1, 21705]) Labels shape: torch.Size([1, 21705]) Final batch size: 1, sequence length: 22197 Attention mask shape: torch.Size([1, 1, 22197, 22197]) Position ids shape: torch.Size([1, 22197]) Input IDs shape: torch.Size([1, 22197]) Labels shape: torch.Size([1, 22197]) Final batch size: 1, sequence length: 25880 Attention mask shape: torch.Size([1, 1, 25880, 25880]) Position ids shape: torch.Size([1, 25880]) Input IDs shape: torch.Size([1, 25880]) Labels shape: torch.Size([1, 25880]) Final batch size: 1, sequence length: 26494 Attention mask shape: torch.Size([1, 1, 26494, 26494]) Position ids shape: torch.Size([1, 26494]) Input IDs shape: torch.Size([1, 26494]) Labels shape: torch.Size([1, 26494]) Final batch size: 1, sequence length: 25053 Attention mask shape: torch.Size([1, 1, 25053, 25053]) Position ids shape: torch.Size([1, 25053]) Input IDs shape: torch.Size([1, 25053]) Labels shape: torch.Size([1, 25053]) Final batch size: 1, sequence length: 27005 Attention mask shape: torch.Size([1, 1, 27005, 27005]) Position ids shape: torch.Size([1, 27005]) Input IDs shape: torch.Size([1, 27005]) Labels shape: torch.Size([1, 27005]) Final batch size: 1, sequence length: 26921 Attention mask shape: torch.Size([1, 1, 26921, 26921]) Position ids shape: torch.Size([1, 26921]) Input IDs shape: torch.Size([1, 26921]) Labels shape: torch.Size([1, 26921]) Final batch size: 1, sequence length: 26303 Attention mask shape: torch.Size([1, 1, 26303, 26303]) Position ids shape: torch.Size([1, 26303]) Input IDs shape: torch.Size([1, 26303]) Labels shape: torch.Size([1, 26303]) Final batch size: 1, sequence length: 28523 Attention mask shape: torch.Size([1, 1, 28523, 28523]) Position ids shape: torch.Size([1, 28523]) Input IDs shape: torch.Size([1, 28523]) Labels shape: torch.Size([1, 28523]) Final batch size: 1, sequence length: 30592 Attention mask shape: torch.Size([1, 1, 30592, 30592]) Position ids shape: torch.Size([1, 30592]) Input IDs shape: torch.Size([1, 30592]) Labels shape: torch.Size([1, 30592]) Final batch size: 1, sequence length: 30151 Attention mask shape: torch.Size([1, 1, 30151, 30151]) Position ids shape: torch.Size([1, 30151]) Input IDs shape: torch.Size([1, 30151]) Labels shape: torch.Size([1, 30151]) Final batch size: 1, sequence length: 32473 Attention mask shape: torch.Size([1, 1, 32473, 32473]) Position ids shape: torch.Size([1, 32473]) Input IDs shape: torch.Size([1, 32473]) Labels shape: torch.Size([1, 32473]) Final batch size: 1, sequence length: 33482 Attention mask shape: torch.Size([1, 1, 33482, 33482]) Position ids shape: torch.Size([1, 33482]) Input IDs shape: torch.Size([1, 33482]) Labels shape: torch.Size([1, 33482]) Final batch size: 1, sequence length: 32596 Attention mask shape: torch.Size([1, 1, 32596, 32596]) Position ids shape: torch.Size([1, 32596]) Input IDs shape: torch.Size([1, 32596]) Labels shape: torch.Size([1, 32596]) Final batch size: 1, sequence length: 35030 Attention mask shape: torch.Size([1, 1, 35030, 35030]) Position ids shape: torch.Size([1, 35030]) Input IDs shape: torch.Size([1, 35030]) Labels shape: torch.Size([1, 35030]) Final batch size: 1, sequence length: 35485 Attention mask shape: torch.Size([1, 1, 35485, 35485]) Position ids shape: torch.Size([1, 35485]) Input IDs shape: torch.Size([1, 35485]) Labels shape: torch.Size([1, 35485]) Final batch size: 1, sequence length: 28897 Attention mask shape: torch.Size([1, 1, 28897, 28897]) Position ids shape: torch.Size([1, 28897]) Input IDs shape: torch.Size([1, 28897]) Labels shape: torch.Size([1, 28897]) Final batch size: 1, sequence length: 35100 Attention mask shape: torch.Size([1, 1, 35100, 35100]) Position ids shape: torch.Size([1, 35100]) Input IDs shape: torch.Size([1, 35100]) Labels shape: torch.Size([1, 35100]) Final batch size: 1, sequence length: 35866 Attention mask shape: torch.Size([1, 1, 35866, 35866]) Position ids shape: torch.Size([1, 35866]) Input IDs shape: torch.Size([1, 35866]) Labels shape: torch.Size([1, 35866]) Final batch size: 1, sequence length: 35523 Attention mask shape: torch.Size([1, 1, 35523, 35523]) Position ids shape: torch.Size([1, 35523]) Input IDs shape: torch.Size([1, 35523]) Labels shape: torch.Size([1, 35523]) Final batch size: 1, sequence length: 34469 Attention mask shape: torch.Size([1, 1, 34469, 34469]) Position ids shape: torch.Size([1, 34469]) Input IDs shape: torch.Size([1, 34469]) Labels shape: torch.Size([1, 34469]) Final batch size: 1, sequence length: 37980 Attention mask shape: torch.Size([1, 1, 37980, 37980]) Position ids shape: torch.Size([1, 37980]) Input IDs shape: torch.Size([1, 37980]) Labels shape: torch.Size([1, 37980]) Final batch size: 1, sequence length: 37040 Attention mask shape: torch.Size([1, 1, 37040, 37040]) Position ids shape: torch.Size([1, 37040]) Input IDs shape: torch.Size([1, 37040]) Labels shape: torch.Size([1, 37040]) Final batch size: 1, sequence length: 40526 Attention mask shape: torch.Size([1, 1, 40526, 40526]) Position ids shape: torch.Size([1, 40526]) Input IDs shape: torch.Size([1, 40526]) Labels shape: torch.Size([1, 40526]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 39589 Attention mask shape: torch.Size([1, 1, 39589, 39589]) Position ids shape: torch.Size([1, 39589]) Input IDs shape: torch.Size([1, 39589]) Labels shape: torch.Size([1, 39589]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 38927 Attention mask shape: torch.Size([1, 1, 38927, 38927]) Position ids shape: torch.Size([1, 38927]) Input IDs shape: torch.Size([1, 38927]) Labels shape: torch.Size([1, 38927]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) {'loss': 0.2499, 'grad_norm': 0.3170515405232184, 'learning_rate': 3.705904774487396e-06, 'num_tokens': -inf, 'epoch': 5.0} Final batch size: 1, sequence length: 5686 Attention mask shape: torch.Size([1, 1, 5686, 5686]) Position ids shape: torch.Size([1, 5686]) Input IDs shape: torch.Size([1, 5686]) Labels shape: torch.Size([1, 5686]) Final batch size: 1, sequence length: 6986 Attention mask shape: torch.Size([1, 1, 6986, 6986]) Position ids shape: torch.Size([1, 6986]) Input IDs shape: torch.Size([1, 6986]) Labels shape: torch.Size([1, 6986]) Final batch size: 1, sequence length: 6362 Attention mask shape: torch.Size([1, 1, 6362, 6362]) Position ids shape: torch.Size([1, 6362]) Input IDs shape: torch.Size([1, 6362]) Labels shape: torch.Size([1, 6362]) Final batch size: 1, sequence length: 10523 Attention mask shape: torch.Size([1, 1, 10523, 10523]) Position ids shape: torch.Size([1, 10523]) Input IDs shape: torch.Size([1, 10523]) Labels shape: torch.Size([1, 10523]) Final batch size: 1, sequence length: 11304 Attention mask shape: torch.Size([1, 1, 11304, 11304]) Position ids shape: torch.Size([1, 11304]) Input IDs shape: torch.Size([1, 11304]) Labels shape: torch.Size([1, 11304]) Final batch size: 1, sequence length: 12847 Attention mask shape: torch.Size([1, 1, 12847, 12847]) Position ids shape: torch.Size([1, 12847]) Input IDs shape: torch.Size([1, 12847]) Labels shape: torch.Size([1, 12847]) Final batch size: 1, sequence length: 14168 Final batch size: 1, sequence length: 14304 Attention mask shape: torch.Size([1, 1, 14168, 14168]) Attention mask shape: torch.Size([1, 1, 14304, 14304]) Position ids shape: torch.Size([1, 14168]) Input IDs shape: torch.Size([1, 14168]) Labels shape: torch.Size([1, 14168]) Position ids shape: torch.Size([1, 14304]) Input IDs shape: torch.Size([1, 14304]) Labels shape: torch.Size([1, 14304]) Final batch size: 1, sequence length: 13224 Attention mask shape: torch.Size([1, 1, 13224, 13224]) Position ids shape: torch.Size([1, 13224]) Input IDs shape: torch.Size([1, 13224]) Labels shape: torch.Size([1, 13224]) Final batch size: 1, sequence length: 13872 Attention mask shape: torch.Size([1, 1, 13872, 13872]) Position ids shape: torch.Size([1, 13872]) Input IDs shape: torch.Size([1, 13872]) Labels shape: torch.Size([1, 13872]) Final batch size: 1, sequence length: 15652 Attention mask shape: torch.Size([1, 1, 15652, 15652]) Position ids shape: torch.Size([1, 15652]) Input IDs shape: torch.Size([1, 15652]) Labels shape: torch.Size([1, 15652]) Final batch size: 1, sequence length: 15621 Attention mask shape: torch.Size([1, 1, 15621, 15621]) Position ids shape: torch.Size([1, 15621]) Input IDs shape: torch.Size([1, 15621]) Labels shape: torch.Size([1, 15621]) Final batch size: 1, sequence length: 17213 Attention mask shape: torch.Size([1, 1, 17213, 17213]) Position ids shape: torch.Size([1, 17213]) Input IDs shape: torch.Size([1, 17213]) Labels shape: torch.Size([1, 17213]) Final batch size: 1, sequence length: 17852 Attention mask shape: torch.Size([1, 1, 17852, 17852]) Position ids shape: torch.Size([1, 17852]) Input IDs shape: torch.Size([1, 17852]) Labels shape: torch.Size([1, 17852]) Final batch size: 1, sequence length: 16514 Attention mask shape: torch.Size([1, 1, 16514, 16514]) Position ids shape: torch.Size([1, 16514]) Input IDs shape: torch.Size([1, 16514]) Labels shape: torch.Size([1, 16514]) Final batch size: 1, sequence length: 16378 Attention mask shape: torch.Size([1, 1, 16378, 16378]) Position ids shape: torch.Size([1, 16378]) Input IDs shape: torch.Size([1, 16378]) Labels shape: torch.Size([1, 16378]) Final batch size: 1, sequence length: 18155 Attention mask shape: torch.Size([1, 1, 18155, 18155]) Position ids shape: torch.Size([1, 18155]) Input IDs shape: torch.Size([1, 18155]) Labels shape: torch.Size([1, 18155]) Final batch size: 1, sequence length: 18044 Attention mask shape: torch.Size([1, 1, 18044, 18044]) Position ids shape: torch.Size([1, 18044]) Input IDs shape: torch.Size([1, 18044]) Labels shape: torch.Size([1, 18044]) Final batch size: 1, sequence length: 18246 Attention mask shape: torch.Size([1, 1, 18246, 18246]) Position ids shape: torch.Size([1, 18246]) Input IDs shape: torch.Size([1, 18246]) Labels shape: torch.Size([1, 18246]) Final batch size: 1, sequence length: 16501 Attention mask shape: torch.Size([1, 1, 16501, 16501]) Position ids shape: torch.Size([1, 16501]) Input IDs shape: torch.Size([1, 16501]) Labels shape: torch.Size([1, 16501]) Final batch size: 1, sequence length: 5911 Attention mask shape: torch.Size([1, 1, 5911, 5911]) Position ids shape: torch.Size([1, 5911]) Input IDs shape: torch.Size([1, 5911]) Labels shape: torch.Size([1, 5911]) Final batch size: 1, sequence length: 20176 Attention mask shape: torch.Size([1, 1, 20176, 20176]) Position ids shape: torch.Size([1, 20176]) Input IDs shape: torch.Size([1, 20176]) Labels shape: torch.Size([1, 20176]) Final batch size: 1, sequence length: 18289 Attention mask shape: torch.Size([1, 1, 18289, 18289]) Position ids shape: torch.Size([1, 18289]) Input IDs shape: torch.Size([1, 18289]) Labels shape: torch.Size([1, 18289]) Final batch size: 1, sequence length: 21163 Attention mask shape: torch.Size([1, 1, 21163, 21163]) Position ids shape: torch.Size([1, 21163]) Input IDs shape: torch.Size([1, 21163]) Labels shape: torch.Size([1, 21163]) Final batch size: 1, sequence length: 21705 Attention mask shape: torch.Size([1, 1, 21705, 21705]) Position ids shape: torch.Size([1, 21705]) Input IDs shape: torch.Size([1, 21705]) Labels shape: torch.Size([1, 21705]) Final batch size: 1, sequence length: 21607 Attention mask shape: torch.Size([1, 1, 21607, 21607]) Position ids shape: torch.Size([1, 21607]) Input IDs shape: torch.Size([1, 21607]) Labels shape: torch.Size([1, 21607]) Final batch size: 1, sequence length: 21316 Attention mask shape: torch.Size([1, 1, 21316, 21316]) Position ids shape: torch.Size([1, 21316]) Input IDs shape: torch.Size([1, 21316]) Labels shape: torch.Size([1, 21316]) Final batch size: 1, sequence length: 21562 Attention mask shape: torch.Size([1, 1, 21562, 21562]) Position ids shape: torch.Size([1, 21562]) Input IDs shape: torch.Size([1, 21562]) Labels shape: torch.Size([1, 21562]) Final batch size: 1, sequence length: 21962 Attention mask shape: torch.Size([1, 1, 21962, 21962]) Position ids shape: torch.Size([1, 21962]) Input IDs shape: torch.Size([1, 21962]) Labels shape: torch.Size([1, 21962]) Final batch size: 1, sequence length: 22197 Attention mask shape: torch.Size([1, 1, 22197, 22197]) Position ids shape: torch.Size([1, 22197]) Input IDs shape: torch.Size([1, 22197]) Labels shape: torch.Size([1, 22197]) Final batch size: 1, sequence length: 17773 Attention mask shape: torch.Size([1, 1, 17773, 17773]) Position ids shape: torch.Size([1, 17773]) Input IDs shape: torch.Size([1, 17773]) Labels shape: torch.Size([1, 17773]) Final batch size: 1, sequence length: 22784 Attention mask shape: torch.Size([1, 1, 22784, 22784]) Position ids shape: torch.Size([1, 22784]) Input IDs shape: torch.Size([1, 22784]) Labels shape: torch.Size([1, 22784]) Final batch size: 1, sequence length: 18545 Attention mask shape: torch.Size([1, 1, 18545, 18545]) Position ids shape: torch.Size([1, 18545]) Input IDs shape: torch.Size([1, 18545]) Labels shape: torch.Size([1, 18545]) Final batch size: 1, sequence length: 20813 Attention mask shape: torch.Size([1, 1, 20813, 20813]) Position ids shape: torch.Size([1, 20813]) Input IDs shape: torch.Size([1, 20813]) Labels shape: torch.Size([1, 20813]) Final batch size: 1, sequence length: 16440 Attention mask shape: torch.Size([1, 1, 16440, 16440]) Position ids shape: torch.Size([1, 16440]) Input IDs shape: torch.Size([1, 16440]) Labels shape: torch.Size([1, 16440]) Final batch size: 1, sequence length: 17910 Attention mask shape: torch.Size([1, 1, 17910, 17910]) Position ids shape: torch.Size([1, 17910]) Input IDs shape: torch.Size([1, 17910]) Labels shape: torch.Size([1, 17910]) Final batch size: 1, sequence length: 20766 Attention mask shape: torch.Size([1, 1, 20766, 20766]) Position ids shape: torch.Size([1, 20766]) Input IDs shape: torch.Size([1, 20766]) Labels shape: torch.Size([1, 20766]) Final batch size: 1, sequence length: 22152 Attention mask shape: torch.Size([1, 1, 22152, 22152]) Position ids shape: torch.Size([1, 22152]) Input IDs shape: torch.Size([1, 22152]) Labels shape: torch.Size([1, 22152]) Final batch size: 1, sequence length: 21207 Attention mask shape: torch.Size([1, 1, 21207, 21207]) Position ids shape: torch.Size([1, 21207]) Input IDs shape: torch.Size([1, 21207]) Labels shape: torch.Size([1, 21207]) Final batch size: 1, sequence length: 25053 Attention mask shape: torch.Size([1, 1, 25053, 25053]) Position ids shape: torch.Size([1, 25053]) Input IDs shape: torch.Size([1, 25053]) Labels shape: torch.Size([1, 25053]) Final batch size: 1, sequence length: 22117 Attention mask shape: torch.Size([1, 1, 22117, 22117]) Position ids shape: torch.Size([1, 22117]) Input IDs shape: torch.Size([1, 22117]) Labels shape: torch.Size([1, 22117]) Final batch size: 1, sequence length: 26303 Attention mask shape: torch.Size([1, 1, 26303, 26303]) Position ids shape: torch.Size([1, 26303]) Input IDs shape: torch.Size([1, 26303]) Labels shape: torch.Size([1, 26303]) Final batch size: 1, sequence length: 24365 Attention mask shape: torch.Size([1, 1, 24365, 24365]) Position ids shape: torch.Size([1, 24365]) Input IDs shape: torch.Size([1, 24365]) Labels shape: torch.Size([1, 24365]) Final batch size: 1, sequence length: 19962 Attention mask shape: torch.Size([1, 1, 19962, 19962]) Position ids shape: torch.Size([1, 19962]) Input IDs shape: torch.Size([1, 19962]) Labels shape: torch.Size([1, 19962]) Final batch size: 1, sequence length: 14496 Attention mask shape: torch.Size([1, 1, 14496, 14496]) Position ids shape: torch.Size([1, 14496]) Input IDs shape: torch.Size([1, 14496]) Labels shape: torch.Size([1, 14496]) Final batch size: 1, sequence length: 23338 Attention mask shape: torch.Size([1, 1, 23338, 23338]) Position ids shape: torch.Size([1, 23338]) Input IDs shape: torch.Size([1, 23338]) Labels shape: torch.Size([1, 23338]) Final batch size: 1, sequence length: 20101 Attention mask shape: torch.Size([1, 1, 20101, 20101]) Position ids shape: torch.Size([1, 20101]) Input IDs shape: torch.Size([1, 20101]) Labels shape: torch.Size([1, 20101]) Final batch size: 1, sequence length: 12421 Attention mask shape: torch.Size([1, 1, 12421, 12421]) Position ids shape: torch.Size([1, 12421]) Input IDs shape: torch.Size([1, 12421]) Labels shape: torch.Size([1, 12421]) Final batch size: 1, sequence length: 26494 Attention mask shape: torch.Size([1, 1, 26494, 26494]) Position ids shape: torch.Size([1, 26494]) Input IDs shape: torch.Size([1, 26494]) Labels shape: torch.Size([1, 26494]) Final batch size: 1, sequence length: 27633 Attention mask shape: torch.Size([1, 1, 27633, 27633]) Position ids shape: torch.Size([1, 27633]) Input IDs shape: torch.Size([1, 27633]) Labels shape: torch.Size([1, 27633]) Final batch size: 1, sequence length: 13031 Attention mask shape: torch.Size([1, 1, 13031, 13031]) Position ids shape: torch.Size([1, 13031]) Input IDs shape: torch.Size([1, 13031]) Labels shape: torch.Size([1, 13031]) Final batch size: 1, sequence length: 25880 Attention mask shape: torch.Size([1, 1, 25880, 25880]) Position ids shape: torch.Size([1, 25880]) Input IDs shape: torch.Size([1, 25880]) Labels shape: torch.Size([1, 25880]) Final batch size: 1, sequence length: 21766 Attention mask shape: torch.Size([1, 1, 21766, 21766]) Position ids shape: torch.Size([1, 21766]) Input IDs shape: torch.Size([1, 21766]) Labels shape: torch.Size([1, 21766]) Final batch size: 1, sequence length: 11515 Attention mask shape: torch.Size([1, 1, 11515, 11515]) Position ids shape: torch.Size([1, 11515]) Input IDs shape: torch.Size([1, 11515]) Labels shape: torch.Size([1, 11515]) Final batch size: 1, sequence length: 21672 Attention mask shape: torch.Size([1, 1, 21672, 21672]) Position ids shape: torch.Size([1, 21672]) Input IDs shape: torch.Size([1, 21672]) Labels shape: torch.Size([1, 21672]) Final batch size: 1, sequence length: 26921 Attention mask shape: torch.Size([1, 1, 26921, 26921]) Position ids shape: torch.Size([1, 26921]) Input IDs shape: torch.Size([1, 26921]) Labels shape: torch.Size([1, 26921]) Final batch size: 1, sequence length: 18470 Attention mask shape: torch.Size([1, 1, 18470, 18470]) Position ids shape: torch.Size([1, 18470]) Input IDs shape: torch.Size([1, 18470]) Labels shape: torch.Size([1, 18470]) Final batch size: 1, sequence length: 20184 Attention mask shape: torch.Size([1, 1, 20184, 20184]) Position ids shape: torch.Size([1, 20184]) Input IDs shape: torch.Size([1, 20184]) Labels shape: torch.Size([1, 20184]) Final batch size: 1, sequence length: 23975 Attention mask shape: torch.Size([1, 1, 23975, 23975]) Position ids shape: torch.Size([1, 23975]) Input IDs shape: torch.Size([1, 23975]) Labels shape: torch.Size([1, 23975]) Final batch size: 1, sequence length: 29404 Attention mask shape: torch.Size([1, 1, 29404, 29404]) Position ids shape: torch.Size([1, 29404]) Input IDs shape: torch.Size([1, 29404]) Labels shape: torch.Size([1, 29404]) Final batch size: 1, sequence length: 30592 Attention mask shape: torch.Size([1, 1, 30592, 30592]) Position ids shape: torch.Size([1, 30592]) Input IDs shape: torch.Size([1, 30592]) Labels shape: torch.Size([1, 30592]) Final batch size: 1, sequence length: 28897 Attention mask shape: torch.Size([1, 1, 28897, 28897]) Position ids shape: torch.Size([1, 28897]) Input IDs shape: torch.Size([1, 28897]) Labels shape: torch.Size([1, 28897]) Final batch size: 1, sequence length: 12224 Attention mask shape: torch.Size([1, 1, 12224, 12224]) Position ids shape: torch.Size([1, 12224]) Input IDs shape: torch.Size([1, 12224]) Labels shape: torch.Size([1, 12224]) Final batch size: 1, sequence length: 26138 Attention mask shape: torch.Size([1, 1, 26138, 26138]) Position ids shape: torch.Size([1, 26138]) Input IDs shape: torch.Size([1, 26138]) Labels shape: torch.Size([1, 26138]) Final batch size: 1, sequence length: 16057 Attention mask shape: torch.Size([1, 1, 16057, 16057]) Position ids shape: torch.Size([1, 16057]) Input IDs shape: torch.Size([1, 16057]) Labels shape: torch.Size([1, 16057]) Final batch size: 1, sequence length: 32473 Attention mask shape: torch.Size([1, 1, 32473, 32473]) Position ids shape: torch.Size([1, 32473]) Input IDs shape: torch.Size([1, 32473]) Labels shape: torch.Size([1, 32473]) Final batch size: 1, sequence length: 28523 Attention mask shape: torch.Size([1, 1, 28523, 28523]) Position ids shape: torch.Size([1, 28523]) Input IDs shape: torch.Size([1, 28523]) Labels shape: torch.Size([1, 28523]) Final batch size: 1, sequence length: 23558 Attention mask shape: torch.Size([1, 1, 23558, 23558]) Position ids shape: torch.Size([1, 23558]) Input IDs shape: torch.Size([1, 23558]) Labels shape: torch.Size([1, 23558]) Final batch size: 1, sequence length: 33482 Attention mask shape: torch.Size([1, 1, 33482, 33482]) Position ids shape: torch.Size([1, 33482]) Input IDs shape: torch.Size([1, 33482]) Labels shape: torch.Size([1, 33482]) Final batch size: 1, sequence length: 9704 Attention mask shape: torch.Size([1, 1, 9704, 9704]) Position ids shape: torch.Size([1, 9704]) Input IDs shape: torch.Size([1, 9704]) Labels shape: torch.Size([1, 9704]) Final batch size: 1, sequence length: 30965 Attention mask shape: torch.Size([1, 1, 30965, 30965]) Position ids shape: torch.Size([1, 30965]) Input IDs shape: torch.Size([1, 30965]) Labels shape: torch.Size([1, 30965]) Final batch size: 1, sequence length: 11608 Attention mask shape: torch.Size([1, 1, 11608, 11608]) Position ids shape: torch.Size([1, 11608]) Input IDs shape: torch.Size([1, 11608]) Labels shape: torch.Size([1, 11608]) Final batch size: 1, sequence length: 21581 Attention mask shape: torch.Size([1, 1, 21581, 21581]) Position ids shape: torch.Size([1, 21581]) Input IDs shape: torch.Size([1, 21581]) Labels shape: torch.Size([1, 21581]) Final batch size: 1, sequence length: 17456 Attention mask shape: torch.Size([1, 1, 17456, 17456]) Position ids shape: torch.Size([1, 17456]) Input IDs shape: torch.Size([1, 17456]) Labels shape: torch.Size([1, 17456]) Final batch size: 1, sequence length: 35485 Attention mask shape: torch.Size([1, 1, 35485, 35485]) Position ids shape: torch.Size([1, 35485]) Input IDs shape: torch.Size([1, 35485]) Labels shape: torch.Size([1, 35485]) Final batch size: 1, sequence length: 34469 Attention mask shape: torch.Size([1, 1, 34469, 34469]) Position ids shape: torch.Size([1, 34469]) Input IDs shape: torch.Size([1, 34469]) Labels shape: torch.Size([1, 34469]) Final batch size: 1, sequence length: 20198 Attention mask shape: torch.Size([1, 1, 20198, 20198]) Position ids shape: torch.Size([1, 20198]) Input IDs shape: torch.Size([1, 20198]) Labels shape: torch.Size([1, 20198]) Final batch size: 1, sequence length: 29481 Attention mask shape: torch.Size([1, 1, 29481, 29481]) Position ids shape: torch.Size([1, 29481]) Input IDs shape: torch.Size([1, 29481]) Labels shape: torch.Size([1, 29481]) Final batch size: 1, sequence length: 24121 Attention mask shape: torch.Size([1, 1, 24121, 24121]) Position ids shape: torch.Size([1, 24121]) Input IDs shape: torch.Size([1, 24121]) Labels shape: torch.Size([1, 24121]) Final batch size: 1, sequence length: 18126 Attention mask shape: torch.Size([1, 1, 18126, 18126]) Position ids shape: torch.Size([1, 18126]) Input IDs shape: torch.Size([1, 18126]) Labels shape: torch.Size([1, 18126]) Final batch size: 1, sequence length: 32596 Attention mask shape: torch.Size([1, 1, 32596, 32596]) Position ids shape: torch.Size([1, 32596]) Input IDs shape: torch.Size([1, 32596]) Labels shape: torch.Size([1, 32596]) Final batch size: 1, sequence length: 35100 Attention mask shape: torch.Size([1, 1, 35100, 35100]) Position ids shape: torch.Size([1, 35100]) Input IDs shape: torch.Size([1, 35100]) Labels shape: torch.Size([1, 35100]) Final batch size: 1, sequence length: 28773 Attention mask shape: torch.Size([1, 1, 28773, 28773]) Position ids shape: torch.Size([1, 28773]) Input IDs shape: torch.Size([1, 28773]) Labels shape: torch.Size([1, 28773]) Final batch size: 1, sequence length: 19620 Attention mask shape: torch.Size([1, 1, 19620, 19620]) Position ids shape: torch.Size([1, 19620]) Input IDs shape: torch.Size([1, 19620]) Labels shape: torch.Size([1, 19620]) Final batch size: 1, sequence length: 18023 Attention mask shape: torch.Size([1, 1, 18023, 18023]) Position ids shape: torch.Size([1, 18023]) Input IDs shape: torch.Size([1, 18023]) Labels shape: torch.Size([1, 18023]) Final batch size: 1, sequence length: 37040 Attention mask shape: torch.Size([1, 1, 37040, 37040]) Position ids shape: torch.Size([1, 37040]) Input IDs shape: torch.Size([1, 37040]) Labels shape: torch.Size([1, 37040]) Final batch size: 1, sequence length: 35030 Attention mask shape: torch.Size([1, 1, 35030, 35030]) Position ids shape: torch.Size([1, 35030]) Input IDs shape: torch.Size([1, 35030]) Labels shape: torch.Size([1, 35030]) Final batch size: 1, sequence length: 11403 Attention mask shape: torch.Size([1, 1, 11403, 11403]) Position ids shape: torch.Size([1, 11403]) Input IDs shape: torch.Size([1, 11403]) Labels shape: torch.Size([1, 11403]) Final batch size: 1, sequence length: 16257 Attention mask shape: torch.Size([1, 1, 16257, 16257]) Position ids shape: torch.Size([1, 16257]) Input IDs shape: torch.Size([1, 16257]) Labels shape: torch.Size([1, 16257]) Final batch size: 1, sequence length: 30072 Attention mask shape: torch.Size([1, 1, 30072, 30072]) Position ids shape: torch.Size([1, 30072]) Input IDs shape: torch.Size([1, 30072]) Labels shape: torch.Size([1, 30072]) Final batch size: 1, sequence length: 18454 Attention mask shape: torch.Size([1, 1, 18454, 18454]) Position ids shape: torch.Size([1, 18454]) Input IDs shape: torch.Size([1, 18454]) Labels shape: torch.Size([1, 18454]) Final batch size: 1, sequence length: 30428 Attention mask shape: torch.Size([1, 1, 30428, 30428]) Position ids shape: torch.Size([1, 30428]) Input IDs shape: torch.Size([1, 30428]) Labels shape: torch.Size([1, 30428]) Final batch size: 1, sequence length: 30772 Attention mask shape: torch.Size([1, 1, 30772, 30772]) Position ids shape: torch.Size([1, 30772]) Input IDs shape: torch.Size([1, 30772]) Labels shape: torch.Size([1, 30772]) Final batch size: 1, sequence length: 37980 Attention mask shape: torch.Size([1, 1, 37980, 37980]) Position ids shape: torch.Size([1, 37980]) Input IDs shape: torch.Size([1, 37980]) Labels shape: torch.Size([1, 37980]) Final batch size: 1, sequence length: 39589 Attention mask shape: torch.Size([1, 1, 39589, 39589]) Position ids shape: torch.Size([1, 39589]) Input IDs shape: torch.Size([1, 39589]) Labels shape: torch.Size([1, 39589]) Final batch size: 1, sequence length: 17870 Attention mask shape: torch.Size([1, 1, 17870, 17870]) Position ids shape: torch.Size([1, 17870]) Input IDs shape: torch.Size([1, 17870]) Labels shape: torch.Size([1, 17870]) Final batch size: 1, sequence length: 29537 Attention mask shape: torch.Size([1, 1, 29537, 29537]) Position ids shape: torch.Size([1, 29537]) Input IDs shape: torch.Size([1, 29537]) Labels shape: torch.Size([1, 29537]) Final batch size: 1, sequence length: 21491 Attention mask shape: torch.Size([1, 1, 21491, 21491]) Position ids shape: torch.Size([1, 21491]) Input IDs shape: torch.Size([1, 21491]) Labels shape: torch.Size([1, 21491]) Final batch size: 1, sequence length: 26215 Attention mask shape: torch.Size([1, 1, 26215, 26215]) Position ids shape: torch.Size([1, 26215]) Input IDs shape: torch.Size([1, 26215]) Labels shape: torch.Size([1, 26215]) Final batch size: 1, sequence length: 30789 Attention mask shape: torch.Size([1, 1, 30789, 30789]) Position ids shape: torch.Size([1, 30789]) Input IDs shape: torch.Size([1, 30789]) Labels shape: torch.Size([1, 30789]) Final batch size: 1, sequence length: 30859 Attention mask shape: torch.Size([1, 1, 30859, 30859]) Position ids shape: torch.Size([1, 30859]) Input IDs shape: torch.Size([1, 30859]) Labels shape: torch.Size([1, 30859]) Final batch size: 1, sequence length: 26708 Attention mask shape: torch.Size([1, 1, 26708, 26708]) Position ids shape: torch.Size([1, 26708]) Input IDs shape: torch.Size([1, 26708]) Labels shape: torch.Size([1, 26708]) Final batch size: 1, sequence length: 19538 Attention mask shape: torch.Size([1, 1, 19538, 19538]) Position ids shape: torch.Size([1, 19538]) Input IDs shape: torch.Size([1, 19538]) Labels shape: torch.Size([1, 19538]) Final batch size: 1, sequence length: 29464 Attention mask shape: torch.Size([1, 1, 29464, 29464]) Position ids shape: torch.Size([1, 29464]) Input IDs shape: torch.Size([1, 29464]) Labels shape: torch.Size([1, 29464]) Final batch size: 1, sequence length: 27702 Attention mask shape: torch.Size([1, 1, 27702, 27702]) Position ids shape: torch.Size([1, 27702]) Input IDs shape: torch.Size([1, 27702]) Labels shape: torch.Size([1, 27702]) Final batch size: 1, sequence length: 19702 Attention mask shape: torch.Size([1, 1, 19702, 19702]) Position ids shape: torch.Size([1, 19702]) Input IDs shape: torch.Size([1, 19702]) Labels shape: torch.Size([1, 19702]) Final batch size: 1, sequence length: 17811 Attention mask shape: torch.Size([1, 1, 17811, 17811]) Position ids shape: torch.Size([1, 17811]) Input IDs shape: torch.Size([1, 17811]) Labels shape: torch.Size([1, 17811]) Final batch size: 1, sequence length: 37992 Attention mask shape: torch.Size([1, 1, 37992, 37992]) Position ids shape: torch.Size([1, 37992]) Input IDs shape: torch.Size([1, 37992]) Labels shape: torch.Size([1, 37992]) Final batch size: 1, sequence length: 13622 Attention mask shape: torch.Size([1, 1, 13622, 13622]) Position ids shape: torch.Size([1, 13622]) Input IDs shape: torch.Size([1, 13622]) Labels shape: torch.Size([1, 13622]) Final batch size: 1, sequence length: 26635 Attention mask shape: torch.Size([1, 1, 26635, 26635]) Position ids shape: torch.Size([1, 26635]) Input IDs shape: torch.Size([1, 26635]) Labels shape: torch.Size([1, 26635]) Final batch size: 1, sequence length: 29875 Attention mask shape: torch.Size([1, 1, 29875, 29875]) Position ids shape: torch.Size([1, 29875]) Input IDs shape: torch.Size([1, 29875]) Labels shape: torch.Size([1, 29875]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 15830 Attention mask shape: torch.Size([1, 1, 15830, 15830]) Position ids shape: torch.Size([1, 15830]) Input IDs shape: torch.Size([1, 15830]) Labels shape: torch.Size([1, 15830]) Final batch size: 1, sequence length: 22309 Attention mask shape: torch.Size([1, 1, 22309, 22309]) Position ids shape: torch.Size([1, 22309]) Input IDs shape: torch.Size([1, 22309]) Labels shape: torch.Size([1, 22309]) Final batch size: 1, sequence length: 38927 Attention mask shape: torch.Size([1, 1, 38927, 38927]) Position ids shape: torch.Size([1, 38927]) Input IDs shape: torch.Size([1, 38927]) Labels shape: torch.Size([1, 38927]) Final batch size: 1, sequence length: 21250 Attention mask shape: torch.Size([1, 1, 21250, 21250]) Position ids shape: torch.Size([1, 21250]) Input IDs shape: torch.Size([1, 21250]) Labels shape: torch.Size([1, 21250]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 33685 Attention mask shape: torch.Size([1, 1, 33685, 33685]) Position ids shape: torch.Size([1, 33685]) Input IDs shape: torch.Size([1, 33685]) Labels shape: torch.Size([1, 33685]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40526 Attention mask shape: torch.Size([1, 1, 40526, 40526]) Position ids shape: torch.Size([1, 40526]) Input IDs shape: torch.Size([1, 40526]) Labels shape: torch.Size([1, 40526]) Final batch size: 1, sequence length: 36580 Attention mask shape: torch.Size([1, 1, 36580, 36580]) Position ids shape: torch.Size([1, 36580]) Input IDs shape: torch.Size([1, 36580]) Labels shape: torch.Size([1, 36580]) Final batch size: 1, sequence length: 21936 Attention mask shape: torch.Size([1, 1, 21936, 21936]) Position ids shape: torch.Size([1, 21936]) Input IDs shape: torch.Size([1, 21936]) Labels shape: torch.Size([1, 21936]) Final batch size: 1, sequence length: 37866 Attention mask shape: torch.Size([1, 1, 37866, 37866]) Position ids shape: torch.Size([1, 37866]) Input IDs shape: torch.Size([1, 37866]) Labels shape: torch.Size([1, 37866]) Final batch size: 1, sequence length: 18606 Final batch size: 1, sequence length: 28814 Attention mask shape: torch.Size([1, 1, 28814, 28814]) Attention mask shape: torch.Size([1, 1, 18606, 18606]) Position ids shape: torch.Size([1, 18606]) Input IDs shape: torch.Size([1, 18606]) Position ids shape: torch.Size([1, 28814]) Input IDs shape: torch.Size([1, 28814]) Labels shape: torch.Size([1, 28814]) Labels shape: torch.Size([1, 18606]) Final batch size: 1, sequence length: 14730 Attention mask shape: torch.Size([1, 1, 14730, 14730]) Position ids shape: torch.Size([1, 14730]) Input IDs shape: torch.Size([1, 14730]) Labels shape: torch.Size([1, 14730]) Final batch size: 1, sequence length: 17623 Attention mask shape: torch.Size([1, 1, 17623, 17623]) Position ids shape: torch.Size([1, 17623]) Input IDs shape: torch.Size([1, 17623]) Labels shape: torch.Size([1, 17623]) Final batch size: 1, sequence length: 15993 Attention mask shape: torch.Size([1, 1, 15993, 15993]) Position ids shape: torch.Size([1, 15993]) Input IDs shape: torch.Size([1, 15993]) Labels shape: torch.Size([1, 15993]) Final batch size: 1, sequence length: 34684 Attention mask shape: torch.Size([1, 1, 34684, 34684]) Position ids shape: torch.Size([1, 34684]) Input IDs shape: torch.Size([1, 34684]) Labels shape: torch.Size([1, 34684]) Final batch size: 1, sequence length: 35077 Attention mask shape: torch.Size([1, 1, 35077, 35077]) Position ids shape: torch.Size([1, 35077]) Input IDs shape: torch.Size([1, 35077]) Labels shape: torch.Size([1, 35077]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 35952 Attention mask shape: torch.Size([1, 1, 35952, 35952]) Position ids shape: torch.Size([1, 35952]) Input IDs shape: torch.Size([1, 35952]) Labels shape: torch.Size([1, 35952]) Final batch size: 1, sequence length: 24232 Attention mask shape: torch.Size([1, 1, 24232, 24232]) Position ids shape: torch.Size([1, 24232]) Input IDs shape: torch.Size([1, 24232]) Labels shape: torch.Size([1, 24232]) Final batch size: 1, sequence length: 31745 Attention mask shape: torch.Size([1, 1, 31745, 31745]) Position ids shape: torch.Size([1, 31745]) Input IDs shape: torch.Size([1, 31745]) Labels shape: torch.Size([1, 31745]) Final batch size: 1, sequence length: 29639 Attention mask shape: torch.Size([1, 1, 29639, 29639]) Position ids shape: torch.Size([1, 29639]) Input IDs shape: torch.Size([1, 29639]) Labels shape: torch.Size([1, 29639]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 37407 Attention mask shape: torch.Size([1, 1, 37407, 37407]) Position ids shape: torch.Size([1, 37407]) Input IDs shape: torch.Size([1, 37407]) Labels shape: torch.Size([1, 37407]) Final batch size: 1, sequence length: 19028 Attention mask shape: torch.Size([1, 1, 19028, 19028]) Position ids shape: torch.Size([1, 19028]) Input IDs shape: torch.Size([1, 19028]) Labels shape: torch.Size([1, 19028]) Final batch size: 1, sequence length: 16270 Attention mask shape: torch.Size([1, 1, 16270, 16270]) Position ids shape: torch.Size([1, 16270]) Input IDs shape: torch.Size([1, 16270]) Labels shape: torch.Size([1, 16270]) Final batch size: 1, sequence length: 36716 Attention mask shape: torch.Size([1, 1, 36716, 36716]) Position ids shape: torch.Size([1, 36716]) Input IDs shape: torch.Size([1, 36716]) Labels shape: torch.Size([1, 36716]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 28810 Attention mask shape: torch.Size([1, 1, 28810, 28810]) Position ids shape: torch.Size([1, 28810]) Input IDs shape: torch.Size([1, 28810]) Labels shape: torch.Size([1, 28810]) Final batch size: 1, sequence length: 17778 Attention mask shape: torch.Size([1, 1, 17778, 17778]) Position ids shape: torch.Size([1, 17778]) Input IDs shape: torch.Size([1, 17778]) Labels shape: torch.Size([1, 17778]) Final batch size: 1, sequence length: 25388 Attention mask shape: torch.Size([1, 1, 25388, 25388]) Position ids shape: torch.Size([1, 25388]) Input IDs shape: torch.Size([1, 25388]) Labels shape: torch.Size([1, 25388]) Final batch size: 1, sequence length: 14104 Attention mask shape: torch.Size([1, 1, 14104, 14104]) Position ids shape: torch.Size([1, 14104]) Input IDs shape: torch.Size([1, 14104]) Labels shape: torch.Size([1, 14104]) Final batch size: 1, sequence length: 39661 Attention mask shape: torch.Size([1, 1, 39661, 39661]) Position ids shape: torch.Size([1, 39661]) Input IDs shape: torch.Size([1, 39661]) Labels shape: torch.Size([1, 39661]) Final batch size: 1, sequence length: 26930 Attention mask shape: torch.Size([1, 1, 26930, 26930]) Position ids shape: torch.Size([1, 26930]) Input IDs shape: torch.Size([1, 26930]) Labels shape: torch.Size([1, 26930]) Final batch size: 1, sequence length: 30135 Attention mask shape: torch.Size([1, 1, 30135, 30135]) Position ids shape: torch.Size([1, 30135]) Input IDs shape: torch.Size([1, 30135]) Labels shape: torch.Size([1, 30135]) Final batch size: 1, sequence length: 20057 Attention mask shape: torch.Size([1, 1, 20057, 20057]) Position ids shape: torch.Size([1, 20057]) Input IDs shape: torch.Size([1, 20057]) Labels shape: torch.Size([1, 20057]) Final batch size: 1, sequence length: 21071 Attention mask shape: torch.Size([1, 1, 21071, 21071]) Position ids shape: torch.Size([1, 21071]) Input IDs shape: torch.Size([1, 21071]) Labels shape: torch.Size([1, 21071]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 20765 Attention mask shape: torch.Size([1, 1, 20765, 20765]) Position ids shape: torch.Size([1, 20765]) Input IDs shape: torch.Size([1, 20765]) Labels shape: torch.Size([1, 20765]) Final batch size: 1, sequence length: 22945 Attention mask shape: torch.Size([1, 1, 22945, 22945]) Position ids shape: torch.Size([1, 22945]) Input IDs shape: torch.Size([1, 22945]) Labels shape: torch.Size([1, 22945]) Final batch size: 1, sequence length: 17971 Attention mask shape: torch.Size([1, 1, 17971, 17971]) Position ids shape: torch.Size([1, 17971]) Input IDs shape: torch.Size([1, 17971]) Labels shape: torch.Size([1, 17971]) Final batch size: 1, sequence length: 27281 Attention mask shape: torch.Size([1, 1, 27281, 27281]) Position ids shape: torch.Size([1, 27281]) Input IDs shape: torch.Size([1, 27281]) Labels shape: torch.Size([1, 27281]) Final batch size: 1, sequence length: 31381 Attention mask shape: torch.Size([1, 1, 31381, 31381]) Position ids shape: torch.Size([1, 31381]) Input IDs shape: torch.Size([1, 31381]) Labels shape: torch.Size([1, 31381]) Final batch size: 1, sequence length: 21006 Attention mask shape: torch.Size([1, 1, 21006, 21006]) Position ids shape: torch.Size([1, 21006]) Input IDs shape: torch.Size([1, 21006]) Labels shape: torch.Size([1, 21006]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 31420 Attention mask shape: torch.Size([1, 1, 31420, 31420]) Position ids shape: torch.Size([1, 31420]) Input IDs shape: torch.Size([1, 31420]) Labels shape: torch.Size([1, 31420]) Final batch size: 1, sequence length: 26665 Attention mask shape: torch.Size([1, 1, 26665, 26665]) Position ids shape: torch.Size([1, 26665]) Input IDs shape: torch.Size([1, 26665]) Labels shape: torch.Size([1, 26665]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 21805 Attention mask shape: torch.Size([1, 1, 21805, 21805]) Position ids shape: torch.Size([1, 21805]) Input IDs shape: torch.Size([1, 21805]) Labels shape: torch.Size([1, 21805]) Final batch size: 1, sequence length: 13903 Attention mask shape: torch.Size([1, 1, 13903, 13903]) Position ids shape: torch.Size([1, 13903]) Input IDs shape: torch.Size([1, 13903]) Labels shape: torch.Size([1, 13903]) Final batch size: 1, sequence length: 25032 Attention mask shape: torch.Size([1, 1, 25032, 25032]) Position ids shape: torch.Size([1, 25032]) Input IDs shape: torch.Size([1, 25032]) Labels shape: torch.Size([1, 25032]) Final batch size: 1, sequence length: 37397 Attention mask shape: torch.Size([1, 1, 37397, 37397]) Position ids shape: torch.Size([1, 37397]) Input IDs shape: torch.Size([1, 37397]) Labels shape: torch.Size([1, 37397]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 32024 Attention mask shape: torch.Size([1, 1, 32024, 32024]) Position ids shape: torch.Size([1, 32024]) Input IDs shape: torch.Size([1, 32024]) Labels shape: torch.Size([1, 32024]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 23187 Attention mask shape: torch.Size([1, 1, 23187, 23187]) Position ids shape: torch.Size([1, 23187]) Input IDs shape: torch.Size([1, 23187]) Labels shape: torch.Size([1, 23187]) Final batch size: 1, sequence length: 18566 Attention mask shape: torch.Size([1, 1, 18566, 18566]) Position ids shape: torch.Size([1, 18566]) Input IDs shape: torch.Size([1, 18566]) Labels shape: torch.Size([1, 18566]) Final batch size: 1, sequence length: 39778 Attention mask shape: torch.Size([1, 1, 39778, 39778]) Position ids shape: torch.Size([1, 39778]) Input IDs shape: torch.Size([1, 39778]) Labels shape: torch.Size([1, 39778]) Final batch size: 1, sequence length: 35879 Attention mask shape: torch.Size([1, 1, 35879, 35879]) Position ids shape: torch.Size([1, 35879]) Input IDs shape: torch.Size([1, 35879]) Labels shape: torch.Size([1, 35879]) Final batch size: 1, sequence length: 40579 Attention mask shape: torch.Size([1, 1, 40579, 40579]) Position ids shape: torch.Size([1, 40579]) Input IDs shape: torch.Size([1, 40579]) Labels shape: torch.Size([1, 40579]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 36415 Attention mask shape: torch.Size([1, 1, 36415, 36415]) Position ids shape: torch.Size([1, 36415]) Input IDs shape: torch.Size([1, 36415]) Labels shape: torch.Size([1, 36415]) Final batch size: 1, sequence length: 39587 Attention mask shape: torch.Size([1, 1, 39587, 39587]) Position ids shape: torch.Size([1, 39587]) Input IDs shape: torch.Size([1, 39587]) Labels shape: torch.Size([1, 39587]) Final batch size: 1, sequence length: 12200 Attention mask shape: torch.Size([1, 1, 12200, 12200]) Position ids shape: torch.Size([1, 12200]) Input IDs shape: torch.Size([1, 12200]) Labels shape: torch.Size([1, 12200]) Final batch size: 1, sequence length: 37530 Attention mask shape: torch.Size([1, 1, 37530, 37530]) Position ids shape: torch.Size([1, 37530]) Input IDs shape: torch.Size([1, 37530]) Labels shape: torch.Size([1, 37530]) Final batch size: 1, sequence length: 37890 Attention mask shape: torch.Size([1, 1, 37890, 37890]) Position ids shape: torch.Size([1, 37890]) Input IDs shape: torch.Size([1, 37890]) Labels shape: torch.Size([1, 37890]) Final batch size: 1, sequence length: 19668 Attention mask shape: torch.Size([1, 1, 19668, 19668]) Position ids shape: torch.Size([1, 19668]) Input IDs shape: torch.Size([1, 19668]) Labels shape: torch.Size([1, 19668]) Final batch size: 1, sequence length: 32541 Attention mask shape: torch.Size([1, 1, 32541, 32541]) Position ids shape: torch.Size([1, 32541]) Input IDs shape: torch.Size([1, 32541]) Labels shape: torch.Size([1, 32541]) Final batch size: 1, sequence length: 25383 Attention mask shape: torch.Size([1, 1, 25383, 25383]) Position ids shape: torch.Size([1, 25383]) Input IDs shape: torch.Size([1, 25383]) Labels shape: torch.Size([1, 25383]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 39579 Attention mask shape: torch.Size([1, 1, 39579, 39579]) Position ids shape: torch.Size([1, 39579]) Input IDs shape: torch.Size([1, 39579]) Labels shape: torch.Size([1, 39579]) Final batch size: 1, sequence length: 14308 Attention mask shape: torch.Size([1, 1, 14308, 14308]) Position ids shape: torch.Size([1, 14308]) Input IDs shape: torch.Size([1, 14308]) Labels shape: torch.Size([1, 14308]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 9251 Attention mask shape: torch.Size([1, 1, 9251, 9251]) Position ids shape: torch.Size([1, 9251]) Input IDs shape: torch.Size([1, 9251]) Labels shape: torch.Size([1, 9251]) Final batch size: 1, sequence length: 36647 Attention mask shape: torch.Size([1, 1, 36647, 36647]) Position ids shape: torch.Size([1, 36647]) Input IDs shape: torch.Size([1, 36647]) Labels shape: torch.Size([1, 36647]) {'loss': 0.2405, 'grad_norm': 0.28364036338490733, 'learning_rate': 3.4549150281252635e-06, 'num_tokens': -inf, 'epoch': 5.12} Final batch size: 1, sequence length: 4858 Attention mask shape: torch.Size([1, 1, 4858, 4858]) Position ids shape: torch.Size([1, 4858]) Input IDs shape: torch.Size([1, 4858]) Labels shape: torch.Size([1, 4858]) Final batch size: 1, sequence length: 6316 Attention mask shape: torch.Size([1, 1, 6316, 6316]) Position ids shape: torch.Size([1, 6316]) Input IDs shape: torch.Size([1, 6316]) Labels shape: torch.Size([1, 6316]) Final batch size: 1, sequence length: 12846 Attention mask shape: torch.Size([1, 1, 12846, 12846]) Position ids shape: torch.Size([1, 12846]) Input IDs shape: torch.Size([1, 12846]) Labels shape: torch.Size([1, 12846]) Final batch size: 1, sequence length: 12945 Attention mask shape: torch.Size([1, 1, 12945, 12945]) Position ids shape: torch.Size([1, 12945]) Input IDs shape: torch.Size([1, 12945]) Labels shape: torch.Size([1, 12945]) Final batch size: 1, sequence length: 7360 Attention mask shape: torch.Size([1, 1, 7360, 7360]) Position ids shape: torch.Size([1, 7360]) Input IDs shape: torch.Size([1, 7360]) Labels shape: torch.Size([1, 7360]) Final batch size: 1, sequence length: 11448 Final batch size: 1, sequence length: 14330 Attention mask shape: torch.Size([1, 1, 11448, 11448]) Position ids shape: torch.Size([1, 11448]) Input IDs shape: torch.Size([1, 11448]) Attention mask shape: torch.Size([1, 1, 14330, 14330]) Labels shape: torch.Size([1, 11448]) Position ids shape: torch.Size([1, 14330]) Input IDs shape: torch.Size([1, 14330]) Labels shape: torch.Size([1, 14330]) Final batch size: 1, sequence length: 14891 Attention mask shape: torch.Size([1, 1, 14891, 14891]) Position ids shape: torch.Size([1, 14891]) Input IDs shape: torch.Size([1, 14891]) Labels shape: torch.Size([1, 14891]) Final batch size: 1, sequence length: 15189 Attention mask shape: torch.Size([1, 1, 15189, 15189]) Position ids shape: torch.Size([1, 15189]) Input IDs shape: torch.Size([1, 15189]) Labels shape: torch.Size([1, 15189]) Final batch size: 1, sequence length: 12075 Attention mask shape: torch.Size([1, 1, 12075, 12075]) Position ids shape: torch.Size([1, 12075]) Input IDs shape: torch.Size([1, 12075]) Labels shape: torch.Size([1, 12075]) Final batch size: 1, sequence length: 16391 Attention mask shape: torch.Size([1, 1, 16391, 16391]) Position ids shape: torch.Size([1, 16391]) Input IDs shape: torch.Size([1, 16391]) Labels shape: torch.Size([1, 16391]) Final batch size: 1, sequence length: 16658 Attention mask shape: torch.Size([1, 1, 16658, 16658]) Position ids shape: torch.Size([1, 16658]) Input IDs shape: torch.Size([1, 16658]) Labels shape: torch.Size([1, 16658]) Final batch size: 1, sequence length: 16961 Attention mask shape: torch.Size([1, 1, 16961, 16961]) Position ids shape: torch.Size([1, 16961]) Input IDs shape: torch.Size([1, 16961]) Labels shape: torch.Size([1, 16961]) Final batch size: 1, sequence length: 16326 Attention mask shape: torch.Size([1, 1, 16326, 16326]) Position ids shape: torch.Size([1, 16326]) Input IDs shape: torch.Size([1, 16326]) Labels shape: torch.Size([1, 16326]) Final batch size: 1, sequence length: 18341 Attention mask shape: torch.Size([1, 1, 18341, 18341]) Position ids shape: torch.Size([1, 18341]) Input IDs shape: torch.Size([1, 18341]) Labels shape: torch.Size([1, 18341]) Final batch size: 1, sequence length: 18400 Attention mask shape: torch.Size([1, 1, 18400, 18400]) Position ids shape: torch.Size([1, 18400]) Input IDs shape: torch.Size([1, 18400]) Labels shape: torch.Size([1, 18400]) Final batch size: 1, sequence length: 19332 Attention mask shape: torch.Size([1, 1, 19332, 19332]) Position ids shape: torch.Size([1, 19332]) Input IDs shape: torch.Size([1, 19332]) Labels shape: torch.Size([1, 19332]) Final batch size: 1, sequence length: 18014 Attention mask shape: torch.Size([1, 1, 18014, 18014]) Position ids shape: torch.Size([1, 18014]) Input IDs shape: torch.Size([1, 18014]) Labels shape: torch.Size([1, 18014]) Final batch size: 1, sequence length: 19597 Attention mask shape: torch.Size([1, 1, 19597, 19597]) Position ids shape: torch.Size([1, 19597]) Input IDs shape: torch.Size([1, 19597]) Labels shape: torch.Size([1, 19597]) Final batch size: 1, sequence length: 17246 Attention mask shape: torch.Size([1, 1, 17246, 17246]) Position ids shape: torch.Size([1, 17246]) Input IDs shape: torch.Size([1, 17246]) Labels shape: torch.Size([1, 17246]) Final batch size: 1, sequence length: 18424 Attention mask shape: torch.Size([1, 1, 18424, 18424]) Position ids shape: torch.Size([1, 18424]) Input IDs shape: torch.Size([1, 18424]) Labels shape: torch.Size([1, 18424]) Final batch size: 1, sequence length: 21122 Attention mask shape: torch.Size([1, 1, 21122, 21122]) Position ids shape: torch.Size([1, 21122]) Input IDs shape: torch.Size([1, 21122]) Labels shape: torch.Size([1, 21122]) Final batch size: 1, sequence length: 16536 Attention mask shape: torch.Size([1, 1, 16536, 16536]) Position ids shape: torch.Size([1, 16536]) Input IDs shape: torch.Size([1, 16536]) Labels shape: torch.Size([1, 16536]) Final batch size: 1, sequence length: 19999 Final batch size: 1, sequence length: 21617 Attention mask shape: torch.Size([1, 1, 19999, 19999]) Position ids shape: torch.Size([1, 19999]) Input IDs shape: torch.Size([1, 19999]) Labels shape: torch.Size([1, 19999]) Attention mask shape: torch.Size([1, 1, 21617, 21617]) Position ids shape: torch.Size([1, 21617]) Input IDs shape: torch.Size([1, 21617]) Labels shape: torch.Size([1, 21617]) Final batch size: 1, sequence length: 20118 Attention mask shape: torch.Size([1, 1, 20118, 20118]) Position ids shape: torch.Size([1, 20118]) Input IDs shape: torch.Size([1, 20118]) Labels shape: torch.Size([1, 20118]) Final batch size: 1, sequence length: 20912 Attention mask shape: torch.Size([1, 1, 20912, 20912]) Position ids shape: torch.Size([1, 20912]) Input IDs shape: torch.Size([1, 20912]) Labels shape: torch.Size([1, 20912]) Final batch size: 1, sequence length: 18393 Attention mask shape: torch.Size([1, 1, 18393, 18393]) Position ids shape: torch.Size([1, 18393]) Input IDs shape: torch.Size([1, 18393]) Labels shape: torch.Size([1, 18393]) Final batch size: 1, sequence length: 24597 Attention mask shape: torch.Size([1, 1, 24597, 24597]) Position ids shape: torch.Size([1, 24597]) Input IDs shape: torch.Size([1, 24597]) Labels shape: torch.Size([1, 24597]) Final batch size: 1, sequence length: 9341 Attention mask shape: torch.Size([1, 1, 9341, 9341]) Position ids shape: torch.Size([1, 9341]) Input IDs shape: torch.Size([1, 9341]) Labels shape: torch.Size([1, 9341]) Final batch size: 1, sequence length: 16831 Attention mask shape: torch.Size([1, 1, 16831, 16831]) Position ids shape: torch.Size([1, 16831]) Input IDs shape: torch.Size([1, 16831]) Labels shape: torch.Size([1, 16831]) Final batch size: 1, sequence length: 20888 Attention mask shape: torch.Size([1, 1, 20888, 20888]) Position ids shape: torch.Size([1, 20888]) Input IDs shape: torch.Size([1, 20888]) Labels shape: torch.Size([1, 20888]) Final batch size: 1, sequence length: 21314 Attention mask shape: torch.Size([1, 1, 21314, 21314]) Position ids shape: torch.Size([1, 21314]) Input IDs shape: torch.Size([1, 21314]) Labels shape: torch.Size([1, 21314]) Final batch size: 1, sequence length: 22854 Attention mask shape: torch.Size([1, 1, 22854, 22854]) Position ids shape: torch.Size([1, 22854]) Input IDs shape: torch.Size([1, 22854]) Labels shape: torch.Size([1, 22854]) Final batch size: 1, sequence length: 21611 Attention mask shape: torch.Size([1, 1, 21611, 21611]) Position ids shape: torch.Size([1, 21611]) Input IDs shape: torch.Size([1, 21611]) Labels shape: torch.Size([1, 21611]) Final batch size: 1, sequence length: 6839 Attention mask shape: torch.Size([1, 1, 6839, 6839]) Position ids shape: torch.Size([1, 6839]) Input IDs shape: torch.Size([1, 6839]) Labels shape: torch.Size([1, 6839]) Final batch size: 1, sequence length: 13638 Attention mask shape: torch.Size([1, 1, 13638, 13638]) Position ids shape: torch.Size([1, 13638]) Input IDs shape: torch.Size([1, 13638]) Labels shape: torch.Size([1, 13638]) Final batch size: 1, sequence length: 25575 Attention mask shape: torch.Size([1, 1, 25575, 25575]) Position ids shape: torch.Size([1, 25575]) Input IDs shape: torch.Size([1, 25575]) Labels shape: torch.Size([1, 25575]) Final batch size: 1, sequence length: 25909 Attention mask shape: torch.Size([1, 1, 25909, 25909]) Position ids shape: torch.Size([1, 25909]) Input IDs shape: torch.Size([1, 25909]) Labels shape: torch.Size([1, 25909]) Final batch size: 1, sequence length: 12328 Attention mask shape: torch.Size([1, 1, 12328, 12328]) Position ids shape: torch.Size([1, 12328]) Input IDs shape: torch.Size([1, 12328]) Labels shape: torch.Size([1, 12328]) Final batch size: 1, sequence length: 15026 Attention mask shape: torch.Size([1, 1, 15026, 15026]) Position ids shape: torch.Size([1, 15026]) Input IDs shape: torch.Size([1, 15026]) Labels shape: torch.Size([1, 15026]) Final batch size: 1, sequence length: 15232 Attention mask shape: torch.Size([1, 1, 15232, 15232]) Position ids shape: torch.Size([1, 15232]) Input IDs shape: torch.Size([1, 15232]) Labels shape: torch.Size([1, 15232]) Final batch size: 1, sequence length: 21739 Attention mask shape: torch.Size([1, 1, 21739, 21739]) Position ids shape: torch.Size([1, 21739]) Input IDs shape: torch.Size([1, 21739]) Labels shape: torch.Size([1, 21739]) Final batch size: 1, sequence length: 25042 Attention mask shape: torch.Size([1, 1, 25042, 25042]) Position ids shape: torch.Size([1, 25042]) Input IDs shape: torch.Size([1, 25042]) Labels shape: torch.Size([1, 25042]) Final batch size: 1, sequence length: 20702 Attention mask shape: torch.Size([1, 1, 20702, 20702]) Position ids shape: torch.Size([1, 20702]) Input IDs shape: torch.Size([1, 20702]) Labels shape: torch.Size([1, 20702]) Final batch size: 1, sequence length: 20619 Attention mask shape: torch.Size([1, 1, 20619, 20619]) Position ids shape: torch.Size([1, 20619]) Input IDs shape: torch.Size([1, 20619]) Labels shape: torch.Size([1, 20619]) Final batch size: 1, sequence length: 16611 Attention mask shape: torch.Size([1, 1, 16611, 16611]) Position ids shape: torch.Size([1, 16611]) Input IDs shape: torch.Size([1, 16611]) Labels shape: torch.Size([1, 16611]) Final batch size: 1, sequence length: 13646 Attention mask shape: torch.Size([1, 1, 13646, 13646]) Position ids shape: torch.Size([1, 13646]) Input IDs shape: torch.Size([1, 13646]) Labels shape: torch.Size([1, 13646]) Final batch size: 1, sequence length: 29730 Attention mask shape: torch.Size([1, 1, 29730, 29730]) Position ids shape: torch.Size([1, 29730]) Input IDs shape: torch.Size([1, 29730]) Labels shape: torch.Size([1, 29730]) Final batch size: 1, sequence length: 21725 Attention mask shape: torch.Size([1, 1, 21725, 21725]) Position ids shape: torch.Size([1, 21725]) Input IDs shape: torch.Size([1, 21725]) Labels shape: torch.Size([1, 21725]) Final batch size: 1, sequence length: 27795 Attention mask shape: torch.Size([1, 1, 27795, 27795]) Position ids shape: torch.Size([1, 27795]) Input IDs shape: torch.Size([1, 27795]) Labels shape: torch.Size([1, 27795]) Final batch size: 1, sequence length: 23614 Attention mask shape: torch.Size([1, 1, 23614, 23614]) Position ids shape: torch.Size([1, 23614]) Input IDs shape: torch.Size([1, 23614]) Labels shape: torch.Size([1, 23614]) Final batch size: 1, sequence length: 28348 Attention mask shape: torch.Size([1, 1, 28348, 28348]) Position ids shape: torch.Size([1, 28348]) Input IDs shape: torch.Size([1, 28348]) Labels shape: torch.Size([1, 28348]) Final batch size: 1, sequence length: 19885 Attention mask shape: torch.Size([1, 1, 19885, 19885]) Position ids shape: torch.Size([1, 19885]) Input IDs shape: torch.Size([1, 19885]) Labels shape: torch.Size([1, 19885]) Final batch size: 1, sequence length: 21677 Attention mask shape: torch.Size([1, 1, 21677, 21677]) Position ids shape: torch.Size([1, 21677]) Input IDs shape: torch.Size([1, 21677]) Labels shape: torch.Size([1, 21677]) Final batch size: 1, sequence length: 26218 Attention mask shape: torch.Size([1, 1, 26218, 26218]) Position ids shape: torch.Size([1, 26218]) Input IDs shape: torch.Size([1, 26218]) Labels shape: torch.Size([1, 26218]) Final batch size: 1, sequence length: 28678 Attention mask shape: torch.Size([1, 1, 28678, 28678]) Position ids shape: torch.Size([1, 28678]) Input IDs shape: torch.Size([1, 28678]) Labels shape: torch.Size([1, 28678]) Final batch size: 1, sequence length: 17780 Attention mask shape: torch.Size([1, 1, 17780, 17780]) Position ids shape: torch.Size([1, 17780]) Input IDs shape: torch.Size([1, 17780]) Labels shape: torch.Size([1, 17780]) Final batch size: 1, sequence length: 24342 Attention mask shape: torch.Size([1, 1, 24342, 24342]) Position ids shape: torch.Size([1, 24342]) Input IDs shape: torch.Size([1, 24342]) Labels shape: torch.Size([1, 24342]) Final batch size: 1, sequence length: 30410 Attention mask shape: torch.Size([1, 1, 30410, 30410]) Position ids shape: torch.Size([1, 30410]) Input IDs shape: torch.Size([1, 30410]) Labels shape: torch.Size([1, 30410]) Final batch size: 1, sequence length: 24090 Attention mask shape: torch.Size([1, 1, 24090, 24090]) Position ids shape: torch.Size([1, 24090]) Input IDs shape: torch.Size([1, 24090]) Labels shape: torch.Size([1, 24090]) Final batch size: 1, sequence length: 22217 Attention mask shape: torch.Size([1, 1, 22217, 22217]) Position ids shape: torch.Size([1, 22217]) Input IDs shape: torch.Size([1, 22217]) Labels shape: torch.Size([1, 22217]) Final batch size: 1, sequence length: 21623 Attention mask shape: torch.Size([1, 1, 21623, 21623]) Position ids shape: torch.Size([1, 21623]) Input IDs shape: torch.Size([1, 21623]) Labels shape: torch.Size([1, 21623]) Final batch size: 1, sequence length: 32034 Attention mask shape: torch.Size([1, 1, 32034, 32034]) Position ids shape: torch.Size([1, 32034]) Input IDs shape: torch.Size([1, 32034]) Labels shape: torch.Size([1, 32034]) Final batch size: 1, sequence length: 30766 Attention mask shape: torch.Size([1, 1, 30766, 30766]) Position ids shape: torch.Size([1, 30766]) Input IDs shape: torch.Size([1, 30766]) Labels shape: torch.Size([1, 30766]) Final batch size: 1, sequence length: 20915 Attention mask shape: torch.Size([1, 1, 20915, 20915]) Position ids shape: torch.Size([1, 20915]) Input IDs shape: torch.Size([1, 20915]) Labels shape: torch.Size([1, 20915]) Final batch size: 1, sequence length: 16403 Attention mask shape: torch.Size([1, 1, 16403, 16403]) Position ids shape: torch.Size([1, 16403]) Input IDs shape: torch.Size([1, 16403]) Labels shape: torch.Size([1, 16403]) Final batch size: 1, sequence length: 30917 Attention mask shape: torch.Size([1, 1, 30917, 30917]) Position ids shape: torch.Size([1, 30917]) Input IDs shape: torch.Size([1, 30917]) Labels shape: torch.Size([1, 30917]) Final batch size: 1, sequence length: 35781 Attention mask shape: torch.Size([1, 1, 35781, 35781]) Position ids shape: torch.Size([1, 35781]) Input IDs shape: torch.Size([1, 35781]) Labels shape: torch.Size([1, 35781]) Final batch size: 1, sequence length: 20028 Attention mask shape: torch.Size([1, 1, 20028, 20028]) Position ids shape: torch.Size([1, 20028]) Input IDs shape: torch.Size([1, 20028]) Labels shape: torch.Size([1, 20028]) Final batch size: 1, sequence length: 34802 Attention mask shape: torch.Size([1, 1, 34802, 34802]) Position ids shape: torch.Size([1, 34802]) Input IDs shape: torch.Size([1, 34802]) Labels shape: torch.Size([1, 34802]) Final batch size: 1, sequence length: 23626 Attention mask shape: torch.Size([1, 1, 23626, 23626]) Position ids shape: torch.Size([1, 23626]) Input IDs shape: torch.Size([1, 23626]) Labels shape: torch.Size([1, 23626]) Final batch size: 1, sequence length: 25698 Attention mask shape: torch.Size([1, 1, 25698, 25698]) Position ids shape: torch.Size([1, 25698]) Input IDs shape: torch.Size([1, 25698]) Labels shape: torch.Size([1, 25698]) Final batch size: 1, sequence length: 35697 Attention mask shape: torch.Size([1, 1, 35697, 35697]) Position ids shape: torch.Size([1, 35697]) Input IDs shape: torch.Size([1, 35697]) Labels shape: torch.Size([1, 35697]) Final batch size: 1, sequence length: 35628 Attention mask shape: torch.Size([1, 1, 35628, 35628]) Position ids shape: torch.Size([1, 35628]) Input IDs shape: torch.Size([1, 35628]) Labels shape: torch.Size([1, 35628]) Final batch size: 1, sequence length: 30366 Attention mask shape: torch.Size([1, 1, 30366, 30366]) Position ids shape: torch.Size([1, 30366]) Input IDs shape: torch.Size([1, 30366]) Labels shape: torch.Size([1, 30366]) Final batch size: 1, sequence length: 14070 Attention mask shape: torch.Size([1, 1, 14070, 14070]) Position ids shape: torch.Size([1, 14070]) Input IDs shape: torch.Size([1, 14070]) Labels shape: torch.Size([1, 14070]) Final batch size: 1, sequence length: 30796 Attention mask shape: torch.Size([1, 1, 30796, 30796]) Position ids shape: torch.Size([1, 30796]) Input IDs shape: torch.Size([1, 30796]) Labels shape: torch.Size([1, 30796]) Final batch size: 1, sequence length: 22768 Attention mask shape: torch.Size([1, 1, 22768, 22768]) Position ids shape: torch.Size([1, 22768]) Input IDs shape: torch.Size([1, 22768]) Labels shape: torch.Size([1, 22768]) Final batch size: 1, sequence length: 21562 Attention mask shape: torch.Size([1, 1, 21562, 21562]) Position ids shape: torch.Size([1, 21562]) Input IDs shape: torch.Size([1, 21562]) Labels shape: torch.Size([1, 21562]) Final batch size: 1, sequence length: 37195 Attention mask shape: torch.Size([1, 1, 37195, 37195]) Position ids shape: torch.Size([1, 37195]) Input IDs shape: torch.Size([1, 37195]) Labels shape: torch.Size([1, 37195]) Final batch size: 1, sequence length: 32239 Attention mask shape: torch.Size([1, 1, 32239, 32239]) Position ids shape: torch.Size([1, 32239]) Input IDs shape: torch.Size([1, 32239]) Labels shape: torch.Size([1, 32239]) Final batch size: 1, sequence length: 38190 Attention mask shape: torch.Size([1, 1, 38190, 38190]) Position ids shape: torch.Size([1, 38190]) Input IDs shape: torch.Size([1, 38190]) Labels shape: torch.Size([1, 38190]) Final batch size: 1, sequence length: 32308 Attention mask shape: torch.Size([1, 1, 32308, 32308]) Position ids shape: torch.Size([1, 32308]) Input IDs shape: torch.Size([1, 32308]) Labels shape: torch.Size([1, 32308]) Final batch size: 1, sequence length: 31143 Attention mask shape: torch.Size([1, 1, 31143, 31143]) Position ids shape: torch.Size([1, 31143]) Input IDs shape: torch.Size([1, 31143]) Labels shape: torch.Size([1, 31143]) Final batch size: 1, sequence length: 15726 Attention mask shape: torch.Size([1, 1, 15726, 15726]) Position ids shape: torch.Size([1, 15726]) Input IDs shape: torch.Size([1, 15726]) Labels shape: torch.Size([1, 15726]) Final batch size: 1, sequence length: 38362 Attention mask shape: torch.Size([1, 1, 38362, 38362]) Position ids shape: torch.Size([1, 38362]) Input IDs shape: torch.Size([1, 38362]) Labels shape: torch.Size([1, 38362]) Final batch size: 1, sequence length: 38420 Attention mask shape: torch.Size([1, 1, 38420, 38420]) Position ids shape: torch.Size([1, 38420]) Input IDs shape: torch.Size([1, 38420]) Labels shape: torch.Size([1, 38420]) Final batch size: 1, sequence length: 39608 Attention mask shape: torch.Size([1, 1, 39608, 39608]) Final batch size: 1, sequence length: 32630 Position ids shape: torch.Size([1, 39608]) Input IDs shape: torch.Size([1, 39608]) Labels shape: torch.Size([1, 39608]) Attention mask shape: torch.Size([1, 1, 32630, 32630]) Position ids shape: torch.Size([1, 32630]) Input IDs shape: torch.Size([1, 32630]) Labels shape: torch.Size([1, 32630]) Final batch size: 1, sequence length: 14828 Attention mask shape: torch.Size([1, 1, 14828, 14828]) Position ids shape: torch.Size([1, 14828]) Input IDs shape: torch.Size([1, 14828]) Labels shape: torch.Size([1, 14828]) Final batch size: 1, sequence length: 22199 Attention mask shape: torch.Size([1, 1, 22199, 22199]) Position ids shape: torch.Size([1, 22199]) Input IDs shape: torch.Size([1, 22199]) Labels shape: torch.Size([1, 22199]) Final batch size: 1, sequence length: 40088 Attention mask shape: torch.Size([1, 1, 40088, 40088]) Position ids shape: torch.Size([1, 40088]) Input IDs shape: torch.Size([1, 40088]) Labels shape: torch.Size([1, 40088]) Final batch size: 1, sequence length: 11896 Attention mask shape: torch.Size([1, 1, 11896, 11896]) Position ids shape: torch.Size([1, 11896]) Input IDs shape: torch.Size([1, 11896]) Labels shape: torch.Size([1, 11896]) Final batch size: 1, sequence length: 10426 Attention mask shape: torch.Size([1, 1, 10426, 10426]) Position ids shape: torch.Size([1, 10426]) Input IDs shape: torch.Size([1, 10426]) Labels shape: torch.Size([1, 10426]) Final batch size: 1, sequence length: 18568 Attention mask shape: torch.Size([1, 1, 18568, 18568]) Position ids shape: torch.Size([1, 18568]) Input IDs shape: torch.Size([1, 18568]) Labels shape: torch.Size([1, 18568]) Final batch size: 1, sequence length: 26840 Attention mask shape: torch.Size([1, 1, 26840, 26840]) Position ids shape: torch.Size([1, 26840]) Input IDs shape: torch.Size([1, 26840]) Labels shape: torch.Size([1, 26840]) Final batch size: 1, sequence length: 34785 Attention mask shape: torch.Size([1, 1, 34785, 34785]) Position ids shape: torch.Size([1, 34785]) Input IDs shape: torch.Size([1, 34785]) Labels shape: torch.Size([1, 34785]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 26287 Attention mask shape: torch.Size([1, 1, 26287, 26287]) Position ids shape: torch.Size([1, 26287]) Input IDs shape: torch.Size([1, 26287]) Labels shape: torch.Size([1, 26287]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 30651 Attention mask shape: torch.Size([1, 1, 30651, 30651]) Position ids shape: torch.Size([1, 30651]) Input IDs shape: torch.Size([1, 30651]) Labels shape: torch.Size([1, 30651]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 37529 Attention mask shape: torch.Size([1, 1, 37529, 37529]) Position ids shape: torch.Size([1, 37529]) Input IDs shape: torch.Size([1, 37529]) Labels shape: torch.Size([1, 37529]) Final batch size: 1, sequence length: 29202 Attention mask shape: torch.Size([1, 1, 29202, 29202]) Position ids shape: torch.Size([1, 29202]) Input IDs shape: torch.Size([1, 29202]) Labels shape: torch.Size([1, 29202]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 16304 Attention mask shape: torch.Size([1, 1, 16304, 16304]) Position ids shape: torch.Size([1, 16304]) Input IDs shape: torch.Size([1, 16304]) Labels shape: torch.Size([1, 16304]) Final batch size: 1, sequence length: 18859 Attention mask shape: torch.Size([1, 1, 18859, 18859]) Position ids shape: torch.Size([1, 18859]) Input IDs shape: torch.Size([1, 18859]) Labels shape: torch.Size([1, 18859]) Final batch size: 1, sequence length: 35914 Attention mask shape: torch.Size([1, 1, 35914, 35914]) Position ids shape: torch.Size([1, 35914]) Input IDs shape: torch.Size([1, 35914]) Labels shape: torch.Size([1, 35914]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 10198 Attention mask shape: torch.Size([1, 1, 10198, 10198]) Position ids shape: torch.Size([1, 10198]) Input IDs shape: torch.Size([1, 10198]) Labels shape: torch.Size([1, 10198]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 23936 Attention mask shape: torch.Size([1, 1, 23936, 23936]) Position ids shape: torch.Size([1, 23936]) Input IDs shape: torch.Size([1, 23936]) Labels shape: torch.Size([1, 23936]) Final batch size: 1, sequence length: 32559 Attention mask shape: torch.Size([1, 1, 32559, 32559]) Position ids shape: torch.Size([1, 32559]) Input IDs shape: torch.Size([1, 32559]) Labels shape: torch.Size([1, 32559]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 27819 Attention mask shape: torch.Size([1, 1, 27819, 27819]) Position ids shape: torch.Size([1, 27819]) Input IDs shape: torch.Size([1, 27819]) Labels shape: torch.Size([1, 27819]) Final batch size: 1, sequence length: 36638 Attention mask shape: torch.Size([1, 1, 36638, 36638]) Position ids shape: torch.Size([1, 36638]) Input IDs shape: torch.Size([1, 36638]) Labels shape: torch.Size([1, 36638]) Final batch size: 1, sequence length: 31316 Attention mask shape: torch.Size([1, 1, 31316, 31316]) Position ids shape: torch.Size([1, 31316]) Input IDs shape: torch.Size([1, 31316]) Labels shape: torch.Size([1, 31316]) Final batch size: 1, sequence length: 19494 Attention mask shape: torch.Size([1, 1, 19494, 19494]) Position ids shape: torch.Size([1, 19494]) Input IDs shape: torch.Size([1, 19494]) Labels shape: torch.Size([1, 19494]) Final batch size: 1, sequence length: 12605 Attention mask shape: torch.Size([1, 1, 12605, 12605]) Position ids shape: torch.Size([1, 12605]) Input IDs shape: torch.Size([1, 12605]) Labels shape: torch.Size([1, 12605]) Final batch size: 1, sequence length: 30835 Attention mask shape: torch.Size([1, 1, 30835, 30835]) Position ids shape: torch.Size([1, 30835]) Input IDs shape: torch.Size([1, 30835]) Labels shape: torch.Size([1, 30835]) Final batch size: 1, sequence length: 26269 Attention mask shape: torch.Size([1, 1, 26269, 26269]) Position ids shape: torch.Size([1, 26269]) Input IDs shape: torch.Size([1, 26269]) Labels shape: torch.Size([1, 26269]) Final batch size: 1, sequence length: 31754 Attention mask shape: torch.Size([1, 1, 31754, 31754]) Position ids shape: torch.Size([1, 31754]) Input IDs shape: torch.Size([1, 31754]) Labels shape: torch.Size([1, 31754]) Final batch size: 1, sequence length: 25573 Attention mask shape: torch.Size([1, 1, 25573, 25573]) Position ids shape: torch.Size([1, 25573]) Input IDs shape: torch.Size([1, 25573]) Labels shape: torch.Size([1, 25573]) Final batch size: 1, sequence length: 40763 Attention mask shape: torch.Size([1, 1, 40763, 40763]) Position ids shape: torch.Size([1, 40763]) Input IDs shape: torch.Size([1, 40763]) Labels shape: torch.Size([1, 40763]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 37170 Attention mask shape: torch.Size([1, 1, 37170, 37170]) Position ids shape: torch.Size([1, 37170]) Input IDs shape: torch.Size([1, 37170]) Labels shape: torch.Size([1, 37170]) {'loss': 0.2682, 'grad_norm': 0.29202284224600045, 'learning_rate': 3.2081602522734987e-06, 'num_tokens': -inf, 'epoch': 5.25} Final batch size: 1, sequence length: 7998 Attention mask shape: torch.Size([1, 1, 7998, 7998]) Position ids shape: torch.Size([1, 7998]) Input IDs shape: torch.Size([1, 7998]) Labels shape: torch.Size([1, 7998]) Final batch size: 1, sequence length: 6925 Attention mask shape: torch.Size([1, 1, 6925, 6925]) Position ids shape: torch.Size([1, 6925]) Input IDs shape: torch.Size([1, 6925]) Labels shape: torch.Size([1, 6925]) Final batch size: 1, sequence length: 10102 Attention mask shape: torch.Size([1, 1, 10102, 10102]) Position ids shape: torch.Size([1, 10102]) Input IDs shape: torch.Size([1, 10102]) Labels shape: torch.Size([1, 10102]) Final batch size: 1, sequence length: 9243 Attention mask shape: torch.Size([1, 1, 9243, 9243]) Position ids shape: torch.Size([1, 9243]) Input IDs shape: torch.Size([1, 9243]) Labels shape: torch.Size([1, 9243]) Final batch size: 1, sequence length: 9452 Attention mask shape: torch.Size([1, 1, 9452, 9452]) Position ids shape: torch.Size([1, 9452]) Input IDs shape: torch.Size([1, 9452]) Labels shape: torch.Size([1, 9452]) Final batch size: 1, sequence length: 7402 Attention mask shape: torch.Size([1, 1, 7402, 7402]) Position ids shape: torch.Size([1, 7402]) Input IDs shape: torch.Size([1, 7402]) Labels shape: torch.Size([1, 7402]) Final batch size: 1, sequence length: 10862 Attention mask shape: torch.Size([1, 1, 10862, 10862]) Position ids shape: torch.Size([1, 10862]) Input IDs shape: torch.Size([1, 10862]) Labels shape: torch.Size([1, 10862]) Final batch size: 1, sequence length: 11947 Attention mask shape: torch.Size([1, 1, 11947, 11947]) Position ids shape: torch.Size([1, 11947]) Input IDs shape: torch.Size([1, 11947]) Labels shape: torch.Size([1, 11947]) Final batch size: 1, sequence length: 12719 Attention mask shape: torch.Size([1, 1, 12719, 12719]) Position ids shape: torch.Size([1, 12719]) Input IDs shape: torch.Size([1, 12719]) Labels shape: torch.Size([1, 12719]) Final batch size: 1, sequence length: 10804 Attention mask shape: torch.Size([1, 1, 10804, 10804]) Position ids shape: torch.Size([1, 10804]) Input IDs shape: torch.Size([1, 10804]) Labels shape: torch.Size([1, 10804]) Final batch size: 1, sequence length: 11623 Attention mask shape: torch.Size([1, 1, 11623, 11623]) Position ids shape: torch.Size([1, 11623]) Input IDs shape: torch.Size([1, 11623]) Labels shape: torch.Size([1, 11623]) Final batch size: 1, sequence length: 8432 Attention mask shape: torch.Size([1, 1, 8432, 8432]) Position ids shape: torch.Size([1, 8432]) Input IDs shape: torch.Size([1, 8432]) Labels shape: torch.Size([1, 8432]) Final batch size: 1, sequence length: 15794 Attention mask shape: torch.Size([1, 1, 15794, 15794]) Position ids shape: torch.Size([1, 15794]) Input IDs shape: torch.Size([1, 15794]) Labels shape: torch.Size([1, 15794]) Final batch size: 1, sequence length: 13209 Attention mask shape: torch.Size([1, 1, 13209, 13209]) Position ids shape: torch.Size([1, 13209]) Input IDs shape: torch.Size([1, 13209]) Labels shape: torch.Size([1, 13209]) Final batch size: 1, sequence length: 15053 Attention mask shape: torch.Size([1, 1, 15053, 15053]) Position ids shape: torch.Size([1, 15053]) Input IDs shape: torch.Size([1, 15053]) Labels shape: torch.Size([1, 15053]) Final batch size: 1, sequence length: 15886 Attention mask shape: torch.Size([1, 1, 15886, 15886]) Position ids shape: torch.Size([1, 15886]) Input IDs shape: torch.Size([1, 15886]) Labels shape: torch.Size([1, 15886]) Final batch size: 1, sequence length: 17370 Attention mask shape: torch.Size([1, 1, 17370, 17370]) Position ids shape: torch.Size([1, 17370]) Input IDs shape: torch.Size([1, 17370]) Labels shape: torch.Size([1, 17370]) Final batch size: 1, sequence length: 17376 Attention mask shape: torch.Size([1, 1, 17376, 17376]) Position ids shape: torch.Size([1, 17376]) Input IDs shape: torch.Size([1, 17376]) Labels shape: torch.Size([1, 17376]) Final batch size: 1, sequence length: 17918 Attention mask shape: torch.Size([1, 1, 17918, 17918]) Position ids shape: torch.Size([1, 17918]) Input IDs shape: torch.Size([1, 17918]) Labels shape: torch.Size([1, 17918]) Final batch size: 1, sequence length: 16060 Attention mask shape: torch.Size([1, 1, 16060, 16060]) Position ids shape: torch.Size([1, 16060]) Input IDs shape: torch.Size([1, 16060]) Labels shape: torch.Size([1, 16060]) Final batch size: 1, sequence length: 20185 Attention mask shape: torch.Size([1, 1, 20185, 20185]) Position ids shape: torch.Size([1, 20185]) Input IDs shape: torch.Size([1, 20185]) Labels shape: torch.Size([1, 20185]) Final batch size: 1, sequence length: 20056 Attention mask shape: torch.Size([1, 1, 20056, 20056]) Position ids shape: torch.Size([1, 20056]) Input IDs shape: torch.Size([1, 20056]) Labels shape: torch.Size([1, 20056]) Final batch size: 1, sequence length: 17767 Attention mask shape: torch.Size([1, 1, 17767, 17767]) Position ids shape: torch.Size([1, 17767]) Input IDs shape: torch.Size([1, 17767]) Labels shape: torch.Size([1, 17767]) Final batch size: 1, sequence length: 17665 Attention mask shape: torch.Size([1, 1, 17665, 17665]) Position ids shape: torch.Size([1, 17665]) Input IDs shape: torch.Size([1, 17665]) Labels shape: torch.Size([1, 17665]) Final batch size: 1, sequence length: 19629 Attention mask shape: torch.Size([1, 1, 19629, 19629]) Position ids shape: torch.Size([1, 19629]) Input IDs shape: torch.Size([1, 19629]) Labels shape: torch.Size([1, 19629]) Final batch size: 1, sequence length: 19935 Final batch size: 1, sequence length: 21117 Attention mask shape: torch.Size([1, 1, 19935, 19935]) Position ids shape: torch.Size([1, 19935]) Attention mask shape: torch.Size([1, 1, 21117, 21117]) Input IDs shape: torch.Size([1, 19935]) Labels shape: torch.Size([1, 19935]) Position ids shape: torch.Size([1, 21117]) Input IDs shape: torch.Size([1, 21117]) Labels shape: torch.Size([1, 21117]) Final batch size: 1, sequence length: 21980 Attention mask shape: torch.Size([1, 1, 21980, 21980]) Position ids shape: torch.Size([1, 21980]) Input IDs shape: torch.Size([1, 21980]) Labels shape: torch.Size([1, 21980]) Final batch size: 1, sequence length: 20554 Attention mask shape: torch.Size([1, 1, 20554, 20554]) Position ids shape: torch.Size([1, 20554]) Input IDs shape: torch.Size([1, 20554]) Labels shape: torch.Size([1, 20554]) Final batch size: 1, sequence length: 12006 Attention mask shape: torch.Size([1, 1, 12006, 12006]) Position ids shape: torch.Size([1, 12006]) Input IDs shape: torch.Size([1, 12006]) Labels shape: torch.Size([1, 12006]) Final batch size: 1, sequence length: 16536 Attention mask shape: torch.Size([1, 1, 16536, 16536]) Position ids shape: torch.Size([1, 16536]) Input IDs shape: torch.Size([1, 16536]) Labels shape: torch.Size([1, 16536]) Final batch size: 1, sequence length: 19609 Attention mask shape: torch.Size([1, 1, 19609, 19609]) Position ids shape: torch.Size([1, 19609]) Input IDs shape: torch.Size([1, 19609]) Labels shape: torch.Size([1, 19609]) Final batch size: 1, sequence length: 22208 Attention mask shape: torch.Size([1, 1, 22208, 22208]) Position ids shape: torch.Size([1, 22208]) Input IDs shape: torch.Size([1, 22208]) Labels shape: torch.Size([1, 22208]) Final batch size: 1, sequence length: 21771 Attention mask shape: torch.Size([1, 1, 21771, 21771]) Position ids shape: torch.Size([1, 21771]) Input IDs shape: torch.Size([1, 21771]) Labels shape: torch.Size([1, 21771]) Final batch size: 1, sequence length: 8498 Attention mask shape: torch.Size([1, 1, 8498, 8498]) Position ids shape: torch.Size([1, 8498]) Input IDs shape: torch.Size([1, 8498]) Labels shape: torch.Size([1, 8498]) Final batch size: 1, sequence length: 21421 Attention mask shape: torch.Size([1, 1, 21421, 21421]) Position ids shape: torch.Size([1, 21421]) Input IDs shape: torch.Size([1, 21421]) Labels shape: torch.Size([1, 21421]) Final batch size: 1, sequence length: 16439 Attention mask shape: torch.Size([1, 1, 16439, 16439]) Position ids shape: torch.Size([1, 16439]) Input IDs shape: torch.Size([1, 16439]) Labels shape: torch.Size([1, 16439]) Final batch size: 1, sequence length: 16141 Attention mask shape: torch.Size([1, 1, 16141, 16141]) Position ids shape: torch.Size([1, 16141]) Input IDs shape: torch.Size([1, 16141]) Labels shape: torch.Size([1, 16141]) Final batch size: 1, sequence length: 17625 Attention mask shape: torch.Size([1, 1, 17625, 17625]) Position ids shape: torch.Size([1, 17625]) Input IDs shape: torch.Size([1, 17625]) Labels shape: torch.Size([1, 17625]) Final batch size: 1, sequence length: 22797 Attention mask shape: torch.Size([1, 1, 22797, 22797]) Position ids shape: torch.Size([1, 22797]) Input IDs shape: torch.Size([1, 22797]) Labels shape: torch.Size([1, 22797]) Final batch size: 1, sequence length: 20524 Attention mask shape: torch.Size([1, 1, 20524, 20524]) Position ids shape: torch.Size([1, 20524]) Input IDs shape: torch.Size([1, 20524]) Labels shape: torch.Size([1, 20524]) Final batch size: 1, sequence length: 26142 Attention mask shape: torch.Size([1, 1, 26142, 26142]) Position ids shape: torch.Size([1, 26142]) Input IDs shape: torch.Size([1, 26142]) Labels shape: torch.Size([1, 26142]) Final batch size: 1, sequence length: 24308 Attention mask shape: torch.Size([1, 1, 24308, 24308]) Position ids shape: torch.Size([1, 24308]) Input IDs shape: torch.Size([1, 24308]) Labels shape: torch.Size([1, 24308]) Final batch size: 1, sequence length: 20816 Attention mask shape: torch.Size([1, 1, 20816, 20816]) Position ids shape: torch.Size([1, 20816]) Input IDs shape: torch.Size([1, 20816]) Labels shape: torch.Size([1, 20816]) Final batch size: 1, sequence length: 18991 Attention mask shape: torch.Size([1, 1, 18991, 18991]) Position ids shape: torch.Size([1, 18991]) Input IDs shape: torch.Size([1, 18991]) Labels shape: torch.Size([1, 18991]) Final batch size: 1, sequence length: 26121 Attention mask shape: torch.Size([1, 1, 26121, 26121]) Position ids shape: torch.Size([1, 26121]) Input IDs shape: torch.Size([1, 26121]) Labels shape: torch.Size([1, 26121]) Final batch size: 1, sequence length: 22683 Attention mask shape: torch.Size([1, 1, 22683, 22683]) Position ids shape: torch.Size([1, 22683]) Input IDs shape: torch.Size([1, 22683]) Labels shape: torch.Size([1, 22683]) Final batch size: 1, sequence length: 24515 Attention mask shape: torch.Size([1, 1, 24515, 24515]) Position ids shape: torch.Size([1, 24515]) Input IDs shape: torch.Size([1, 24515]) Labels shape: torch.Size([1, 24515]) Final batch size: 1, sequence length: 27179 Attention mask shape: torch.Size([1, 1, 27179, 27179]) Position ids shape: torch.Size([1, 27179]) Input IDs shape: torch.Size([1, 27179]) Labels shape: torch.Size([1, 27179]) Final batch size: 1, sequence length: 26449 Attention mask shape: torch.Size([1, 1, 26449, 26449]) Position ids shape: torch.Size([1, 26449]) Input IDs shape: torch.Size([1, 26449]) Labels shape: torch.Size([1, 26449]) Final batch size: 1, sequence length: 24433 Attention mask shape: torch.Size([1, 1, 24433, 24433]) Position ids shape: torch.Size([1, 24433]) Input IDs shape: torch.Size([1, 24433]) Labels shape: torch.Size([1, 24433]) Final batch size: 1, sequence length: 16564 Attention mask shape: torch.Size([1, 1, 16564, 16564]) Position ids shape: torch.Size([1, 16564]) Input IDs shape: torch.Size([1, 16564]) Labels shape: torch.Size([1, 16564]) Final batch size: 1, sequence length: 29628 Attention mask shape: torch.Size([1, 1, 29628, 29628]) Position ids shape: torch.Size([1, 29628]) Input IDs shape: torch.Size([1, 29628]) Labels shape: torch.Size([1, 29628]) Final batch size: 1, sequence length: 24802 Attention mask shape: torch.Size([1, 1, 24802, 24802]) Position ids shape: torch.Size([1, 24802]) Input IDs shape: torch.Size([1, 24802]) Labels shape: torch.Size([1, 24802]) Final batch size: 1, sequence length: 18122 Attention mask shape: torch.Size([1, 1, 18122, 18122]) Position ids shape: torch.Size([1, 18122]) Input IDs shape: torch.Size([1, 18122]) Labels shape: torch.Size([1, 18122]) Final batch size: 1, sequence length: 28164 Attention mask shape: torch.Size([1, 1, 28164, 28164]) Position ids shape: torch.Size([1, 28164]) Input IDs shape: torch.Size([1, 28164]) Labels shape: torch.Size([1, 28164]) Final batch size: 1, sequence length: 17363 Attention mask shape: torch.Size([1, 1, 17363, 17363]) Position ids shape: torch.Size([1, 17363]) Input IDs shape: torch.Size([1, 17363]) Labels shape: torch.Size([1, 17363]) Final batch size: 1, sequence length: 31507 Attention mask shape: torch.Size([1, 1, 31507, 31507]) Position ids shape: torch.Size([1, 31507]) Input IDs shape: torch.Size([1, 31507]) Labels shape: torch.Size([1, 31507]) Final batch size: 1, sequence length: 30197 Attention mask shape: torch.Size([1, 1, 30197, 30197]) Position ids shape: torch.Size([1, 30197]) Input IDs shape: torch.Size([1, 30197]) Labels shape: torch.Size([1, 30197]) Final batch size: 1, sequence length: 10857 Attention mask shape: torch.Size([1, 1, 10857, 10857]) Position ids shape: torch.Size([1, 10857]) Input IDs shape: torch.Size([1, 10857]) Labels shape: torch.Size([1, 10857]) Final batch size: 1, sequence length: 21152 Attention mask shape: torch.Size([1, 1, 21152, 21152]) Position ids shape: torch.Size([1, 21152]) Input IDs shape: torch.Size([1, 21152]) Labels shape: torch.Size([1, 21152]) Final batch size: 1, sequence length: 30181 Attention mask shape: torch.Size([1, 1, 30181, 30181]) Position ids shape: torch.Size([1, 30181]) Input IDs shape: torch.Size([1, 30181]) Labels shape: torch.Size([1, 30181]) Final batch size: 1, sequence length: 32752 Attention mask shape: torch.Size([1, 1, 32752, 32752]) Position ids shape: torch.Size([1, 32752]) Input IDs shape: torch.Size([1, 32752]) Labels shape: torch.Size([1, 32752]) Final batch size: 1, sequence length: 24527 Attention mask shape: torch.Size([1, 1, 24527, 24527]) Position ids shape: torch.Size([1, 24527]) Input IDs shape: torch.Size([1, 24527]) Labels shape: torch.Size([1, 24527]) Final batch size: 1, sequence length: 31339 Attention mask shape: torch.Size([1, 1, 31339, 31339]) Position ids shape: torch.Size([1, 31339]) Input IDs shape: torch.Size([1, 31339]) Labels shape: torch.Size([1, 31339]) Final batch size: 1, sequence length: 16837 Attention mask shape: torch.Size([1, 1, 16837, 16837]) Position ids shape: torch.Size([1, 16837]) Input IDs shape: torch.Size([1, 16837]) Labels shape: torch.Size([1, 16837]) Final batch size: 1, sequence length: 32466 Attention mask shape: torch.Size([1, 1, 32466, 32466]) Position ids shape: torch.Size([1, 32466]) Input IDs shape: torch.Size([1, 32466]) Labels shape: torch.Size([1, 32466]) Final batch size: 1, sequence length: 22762 Attention mask shape: torch.Size([1, 1, 22762, 22762]) Position ids shape: torch.Size([1, 22762]) Input IDs shape: torch.Size([1, 22762]) Labels shape: torch.Size([1, 22762]) Final batch size: 1, sequence length: 29392 Attention mask shape: torch.Size([1, 1, 29392, 29392]) Position ids shape: torch.Size([1, 29392]) Input IDs shape: torch.Size([1, 29392]) Labels shape: torch.Size([1, 29392]) Final batch size: 1, sequence length: 34711 Attention mask shape: torch.Size([1, 1, 34711, 34711]) Position ids shape: torch.Size([1, 34711]) Input IDs shape: torch.Size([1, 34711]) Labels shape: torch.Size([1, 34711]) Final batch size: 1, sequence length: 33894 Attention mask shape: torch.Size([1, 1, 33894, 33894]) Position ids shape: torch.Size([1, 33894]) Input IDs shape: torch.Size([1, 33894]) Labels shape: torch.Size([1, 33894]) Final batch size: 1, sequence length: 11107 Attention mask shape: torch.Size([1, 1, 11107, 11107]) Position ids shape: torch.Size([1, 11107]) Input IDs shape: torch.Size([1, 11107]) Labels shape: torch.Size([1, 11107]) Final batch size: 1, sequence length: 31525 Attention mask shape: torch.Size([1, 1, 31525, 31525]) Position ids shape: torch.Size([1, 31525]) Input IDs shape: torch.Size([1, 31525]) Labels shape: torch.Size([1, 31525]) Final batch size: 1, sequence length: 33367 Attention mask shape: torch.Size([1, 1, 33367, 33367]) Position ids shape: torch.Size([1, 33367]) Input IDs shape: torch.Size([1, 33367]) Labels shape: torch.Size([1, 33367]) Final batch size: 1, sequence length: 20274 Attention mask shape: torch.Size([1, 1, 20274, 20274]) Position ids shape: torch.Size([1, 20274]) Input IDs shape: torch.Size([1, 20274]) Labels shape: torch.Size([1, 20274]) Final batch size: 1, sequence length: 34512 Attention mask shape: torch.Size([1, 1, 34512, 34512]) Position ids shape: torch.Size([1, 34512]) Input IDs shape: torch.Size([1, 34512]) Labels shape: torch.Size([1, 34512]) Final batch size: 1, sequence length: 33087 Attention mask shape: torch.Size([1, 1, 33087, 33087]) Position ids shape: torch.Size([1, 33087]) Input IDs shape: torch.Size([1, 33087]) Labels shape: torch.Size([1, 33087]) Final batch size: 1, sequence length: 35523 Attention mask shape: torch.Size([1, 1, 35523, 35523]) Position ids shape: torch.Size([1, 35523]) Input IDs shape: torch.Size([1, 35523]) Labels shape: torch.Size([1, 35523]) Final batch size: 1, sequence length: 21143 Attention mask shape: torch.Size([1, 1, 21143, 21143]) Position ids shape: torch.Size([1, 21143]) Input IDs shape: torch.Size([1, 21143]) Labels shape: torch.Size([1, 21143]) Final batch size: 1, sequence length: 17181 Attention mask shape: torch.Size([1, 1, 17181, 17181]) Position ids shape: torch.Size([1, 17181]) Input IDs shape: torch.Size([1, 17181]) Labels shape: torch.Size([1, 17181]) Final batch size: 1, sequence length: 32891 Attention mask shape: torch.Size([1, 1, 32891, 32891]) Position ids shape: torch.Size([1, 32891]) Input IDs shape: torch.Size([1, 32891]) Labels shape: torch.Size([1, 32891]) Final batch size: 1, sequence length: 27061 Attention mask shape: torch.Size([1, 1, 27061, 27061]) Position ids shape: torch.Size([1, 27061]) Input IDs shape: torch.Size([1, 27061]) Labels shape: torch.Size([1, 27061]) Final batch size: 1, sequence length: 34405 Attention mask shape: torch.Size([1, 1, 34405, 34405]) Position ids shape: torch.Size([1, 34405]) Input IDs shape: torch.Size([1, 34405]) Labels shape: torch.Size([1, 34405]) Final batch size: 1, sequence length: 33794 Attention mask shape: torch.Size([1, 1, 33794, 33794]) Position ids shape: torch.Size([1, 33794]) Input IDs shape: torch.Size([1, 33794]) Labels shape: torch.Size([1, 33794]) Final batch size: 1, sequence length: 13942 Attention mask shape: torch.Size([1, 1, 13942, 13942]) Position ids shape: torch.Size([1, 13942]) Input IDs shape: torch.Size([1, 13942]) Labels shape: torch.Size([1, 13942]) Final batch size: 1, sequence length: 37343 Attention mask shape: torch.Size([1, 1, 37343, 37343]) Position ids shape: torch.Size([1, 37343]) Input IDs shape: torch.Size([1, 37343]) Labels shape: torch.Size([1, 37343]) Final batch size: 1, sequence length: 37555 Attention mask shape: torch.Size([1, 1, 37555, 37555]) Position ids shape: torch.Size([1, 37555]) Input IDs shape: torch.Size([1, 37555]) Labels shape: torch.Size([1, 37555]) Final batch size: 1, sequence length: 38010 Attention mask shape: torch.Size([1, 1, 38010, 38010]) Position ids shape: torch.Size([1, 38010]) Input IDs shape: torch.Size([1, 38010]) Labels shape: torch.Size([1, 38010]) Final batch size: 1, sequence length: 31875 Attention mask shape: torch.Size([1, 1, 31875, 31875]) Position ids shape: torch.Size([1, 31875]) Input IDs shape: torch.Size([1, 31875]) Labels shape: torch.Size([1, 31875]) Final batch size: 1, sequence length: 18229 Attention mask shape: torch.Size([1, 1, 18229, 18229]) Position ids shape: torch.Size([1, 18229]) Input IDs shape: torch.Size([1, 18229]) Labels shape: torch.Size([1, 18229]) Final batch size: 1, sequence length: 31518 Attention mask shape: torch.Size([1, 1, 31518, 31518]) Position ids shape: torch.Size([1, 31518]) Input IDs shape: torch.Size([1, 31518]) Labels shape: torch.Size([1, 31518]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 21859 Attention mask shape: torch.Size([1, 1, 21859, 21859]) Position ids shape: torch.Size([1, 21859]) Input IDs shape: torch.Size([1, 21859]) Labels shape: torch.Size([1, 21859]) Final batch size: 1, sequence length: 31561 Attention mask shape: torch.Size([1, 1, 31561, 31561]) Position ids shape: torch.Size([1, 31561]) Input IDs shape: torch.Size([1, 31561]) Labels shape: torch.Size([1, 31561]) Final batch size: 1, sequence length: 13978 Attention mask shape: torch.Size([1, 1, 13978, 13978]) Position ids shape: torch.Size([1, 13978]) Input IDs shape: torch.Size([1, 13978]) Labels shape: torch.Size([1, 13978]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40910 Attention mask shape: torch.Size([1, 1, 40910, 40910]) Position ids shape: torch.Size([1, 40910]) Input IDs shape: torch.Size([1, 40910]) Labels shape: torch.Size([1, 40910]) Final batch size: 1, sequence length: 36450 Attention mask shape: torch.Size([1, 1, 36450, 36450]) Position ids shape: torch.Size([1, 36450]) Input IDs shape: torch.Size([1, 36450]) Labels shape: torch.Size([1, 36450]) Final batch size: 1, sequence length: 16716 Attention mask shape: torch.Size([1, 1, 16716, 16716]) Position ids shape: torch.Size([1, 16716]) Input IDs shape: torch.Size([1, 16716]) Labels shape: torch.Size([1, 16716]) Final batch size: 1, sequence length: 16433 Attention mask shape: torch.Size([1, 1, 16433, 16433]) Position ids shape: torch.Size([1, 16433]) Input IDs shape: torch.Size([1, 16433]) Labels shape: torch.Size([1, 16433]) Final batch size: 1, sequence length: 29042 Attention mask shape: torch.Size([1, 1, 29042, 29042]) Position ids shape: torch.Size([1, 29042]) Input IDs shape: torch.Size([1, 29042]) Labels shape: torch.Size([1, 29042]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 36464 Attention mask shape: torch.Size([1, 1, 36464, 36464]) Position ids shape: torch.Size([1, 36464]) Input IDs shape: torch.Size([1, 36464]) Labels shape: torch.Size([1, 36464]) Final batch size: 1, sequence length: 19287 Attention mask shape: torch.Size([1, 1, 19287, 19287]) Position ids shape: torch.Size([1, 19287]) Input IDs shape: torch.Size([1, 19287]) Labels shape: torch.Size([1, 19287]) Final batch size: 1, sequence length: 30134 Attention mask shape: torch.Size([1, 1, 30134, 30134]) Position ids shape: torch.Size([1, 30134]) Input IDs shape: torch.Size([1, 30134]) Labels shape: torch.Size([1, 30134]) Final batch size: 1, sequence length: 29216 Attention mask shape: torch.Size([1, 1, 29216, 29216]) Position ids shape: torch.Size([1, 29216]) Input IDs shape: torch.Size([1, 29216]) Labels shape: torch.Size([1, 29216]) Final batch size: 1, sequence length: 37349 Attention mask shape: torch.Size([1, 1, 37349, 37349]) Position ids shape: torch.Size([1, 37349]) Input IDs shape: torch.Size([1, 37349]) Labels shape: torch.Size([1, 37349]) Final batch size: 1, sequence length: 31449 Attention mask shape: torch.Size([1, 1, 31449, 31449]) Position ids shape: torch.Size([1, 31449]) Input IDs shape: torch.Size([1, 31449]) Labels shape: torch.Size([1, 31449]) Final batch size: 1, sequence length: 17758 Attention mask shape: torch.Size([1, 1, 17758, 17758]) Position ids shape: torch.Size([1, 17758]) Input IDs shape: torch.Size([1, 17758]) Labels shape: torch.Size([1, 17758]) Final batch size: 1, sequence length: 26922 Attention mask shape: torch.Size([1, 1, 26922, 26922]) Position ids shape: torch.Size([1, 26922]) Input IDs shape: torch.Size([1, 26922]) Labels shape: torch.Size([1, 26922]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40752 Attention mask shape: torch.Size([1, 1, 40752, 40752]) Position ids shape: torch.Size([1, 40752]) Input IDs shape: torch.Size([1, 40752]) Labels shape: torch.Size([1, 40752]) Final batch size: 1, sequence length: 39169 Attention mask shape: torch.Size([1, 1, 39169, 39169]) Position ids shape: torch.Size([1, 39169]) Input IDs shape: torch.Size([1, 39169]) Labels shape: torch.Size([1, 39169]) Final batch size: 1, sequence length: 26893 Attention mask shape: torch.Size([1, 1, 26893, 26893]) Position ids shape: torch.Size([1, 26893]) Input IDs shape: torch.Size([1, 26893]) Labels shape: torch.Size([1, 26893]) Final batch size: 1, sequence length: 38529 Attention mask shape: torch.Size([1, 1, 38529, 38529]) Position ids shape: torch.Size([1, 38529]) Input IDs shape: torch.Size([1, 38529]) Labels shape: torch.Size([1, 38529]) Final batch size: 1, sequence length: 36599 Attention mask shape: torch.Size([1, 1, 36599, 36599]) Position ids shape: torch.Size([1, 36599]) Input IDs shape: torch.Size([1, 36599]) Labels shape: torch.Size([1, 36599]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 17535 Attention mask shape: torch.Size([1, 1, 17535, 17535]) Position ids shape: torch.Size([1, 17535]) Input IDs shape: torch.Size([1, 17535]) Labels shape: torch.Size([1, 17535]) Final batch size: 1, sequence length: 38891 Attention mask shape: torch.Size([1, 1, 38891, 38891]) Position ids shape: torch.Size([1, 38891]) Input IDs shape: torch.Size([1, 38891]) Labels shape: torch.Size([1, 38891]) Final batch size: 1, sequence length: 32638 Attention mask shape: torch.Size([1, 1, 32638, 32638]) Position ids shape: torch.Size([1, 32638]) Input IDs shape: torch.Size([1, 32638]) Labels shape: torch.Size([1, 32638]) Final batch size: 1, sequence length: 39258 Attention mask shape: torch.Size([1, 1, 39258, 39258]) Position ids shape: torch.Size([1, 39258]) Input IDs shape: torch.Size([1, 39258]) Labels shape: torch.Size([1, 39258]) Final batch size: 1, sequence length: 17890 Attention mask shape: torch.Size([1, 1, 17890, 17890]) Position ids shape: torch.Size([1, 17890]) Input IDs shape: torch.Size([1, 17890]) Labels shape: torch.Size([1, 17890]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 32465 Attention mask shape: torch.Size([1, 1, 32465, 32465]) Position ids shape: torch.Size([1, 32465]) Input IDs shape: torch.Size([1, 32465]) Labels shape: torch.Size([1, 32465]) Final batch size: 1, sequence length: 18471 Attention mask shape: torch.Size([1, 1, 18471, 18471]) Position ids shape: torch.Size([1, 18471]) Input IDs shape: torch.Size([1, 18471]) Labels shape: torch.Size([1, 18471]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 39953 Attention mask shape: torch.Size([1, 1, 39953, 39953]) Position ids shape: torch.Size([1, 39953]) Input IDs shape: torch.Size([1, 39953]) Labels shape: torch.Size([1, 39953]) {'loss': 0.2565, 'grad_norm': 0.31389690471317544, 'learning_rate': 2.966316784621e-06, 'num_tokens': -inf, 'epoch': 5.38} Final batch size: 1, sequence length: 5818 Attention mask shape: torch.Size([1, 1, 5818, 5818]) Position ids shape: torch.Size([1, 5818]) Input IDs shape: torch.Size([1, 5818]) Labels shape: torch.Size([1, 5818]) Final batch size: 1, sequence length: 6215 Attention mask shape: torch.Size([1, 1, 6215, 6215]) Position ids shape: torch.Size([1, 6215]) Input IDs shape: torch.Size([1, 6215]) Labels shape: torch.Size([1, 6215]) Final batch size: 1, sequence length: 7977 Attention mask shape: torch.Size([1, 1, 7977, 7977]) Position ids shape: torch.Size([1, 7977]) Input IDs shape: torch.Size([1, 7977]) Labels shape: torch.Size([1, 7977]) Final batch size: 1, sequence length: 10107 Attention mask shape: torch.Size([1, 1, 10107, 10107]) Position ids shape: torch.Size([1, 10107]) Input IDs shape: torch.Size([1, 10107]) Labels shape: torch.Size([1, 10107]) Final batch size: 1, sequence length: 6095 Attention mask shape: torch.Size([1, 1, 6095, 6095]) Position ids shape: torch.Size([1, 6095]) Input IDs shape: torch.Size([1, 6095]) Labels shape: torch.Size([1, 6095]) Final batch size: 1, sequence length: 10080 Attention mask shape: torch.Size([1, 1, 10080, 10080]) Position ids shape: torch.Size([1, 10080]) Input IDs shape: torch.Size([1, 10080]) Labels shape: torch.Size([1, 10080]) Final batch size: 1, sequence length: 12454 Attention mask shape: torch.Size([1, 1, 12454, 12454]) Position ids shape: torch.Size([1, 12454]) Input IDs shape: torch.Size([1, 12454]) Labels shape: torch.Size([1, 12454]) Final batch size: 1, sequence length: 12826 Attention mask shape: torch.Size([1, 1, 12826, 12826]) Position ids shape: torch.Size([1, 12826]) Input IDs shape: torch.Size([1, 12826]) Labels shape: torch.Size([1, 12826]) Final batch size: 1, sequence length: 10505 Attention mask shape: torch.Size([1, 1, 10505, 10505]) Position ids shape: torch.Size([1, 10505]) Input IDs shape: torch.Size([1, 10505]) Labels shape: torch.Size([1, 10505]) Final batch size: 1, sequence length: 9217 Attention mask shape: torch.Size([1, 1, 9217, 9217]) Position ids shape: torch.Size([1, 9217]) Input IDs shape: torch.Size([1, 9217]) Labels shape: torch.Size([1, 9217]) Final batch size: 1, sequence length: 13092 Attention mask shape: torch.Size([1, 1, 13092, 13092]) Position ids shape: torch.Size([1, 13092]) Input IDs shape: torch.Size([1, 13092]) Labels shape: torch.Size([1, 13092]) Final batch size: 1, sequence length: 8500 Attention mask shape: torch.Size([1, 1, 8500, 8500]) Position ids shape: torch.Size([1, 8500]) Input IDs shape: torch.Size([1, 8500]) Labels shape: torch.Size([1, 8500]) Final batch size: 1, sequence length: 13624 Attention mask shape: torch.Size([1, 1, 13624, 13624]) Position ids shape: torch.Size([1, 13624]) Input IDs shape: torch.Size([1, 13624]) Labels shape: torch.Size([1, 13624]) Final batch size: 1, sequence length: 12556 Attention mask shape: torch.Size([1, 1, 12556, 12556]) Position ids shape: torch.Size([1, 12556]) Input IDs shape: torch.Size([1, 12556]) Labels shape: torch.Size([1, 12556]) Final batch size: 1, sequence length: 10687 Attention mask shape: torch.Size([1, 1, 10687, 10687]) Position ids shape: torch.Size([1, 10687]) Input IDs shape: torch.Size([1, 10687]) Labels shape: torch.Size([1, 10687]) Final batch size: 1, sequence length: 12928 Attention mask shape: torch.Size([1, 1, 12928, 12928]) Position ids shape: torch.Size([1, 12928]) Input IDs shape: torch.Size([1, 12928]) Labels shape: torch.Size([1, 12928]) Final batch size: 1, sequence length: 13459 Attention mask shape: torch.Size([1, 1, 13459, 13459]) Position ids shape: torch.Size([1, 13459]) Input IDs shape: torch.Size([1, 13459]) Labels shape: torch.Size([1, 13459]) Final batch size: 1, sequence length: 16782 Attention mask shape: torch.Size([1, 1, 16782, 16782]) Position ids shape: torch.Size([1, 16782]) Input IDs shape: torch.Size([1, 16782]) Labels shape: torch.Size([1, 16782]) Final batch size: 1, sequence length: 10469 Attention mask shape: torch.Size([1, 1, 10469, 10469]) Position ids shape: torch.Size([1, 10469]) Input IDs shape: torch.Size([1, 10469]) Labels shape: torch.Size([1, 10469]) Final batch size: 1, sequence length: 9379 Attention mask shape: torch.Size([1, 1, 9379, 9379]) Position ids shape: torch.Size([1, 9379]) Input IDs shape: torch.Size([1, 9379]) Labels shape: torch.Size([1, 9379]) Final batch size: 1, sequence length: 10390 Attention mask shape: torch.Size([1, 1, 10390, 10390]) Position ids shape: torch.Size([1, 10390]) Input IDs shape: torch.Size([1, 10390]) Labels shape: torch.Size([1, 10390]) Final batch size: 1, sequence length: 18469 Attention mask shape: torch.Size([1, 1, 18469, 18469]) Position ids shape: torch.Size([1, 18469]) Input IDs shape: torch.Size([1, 18469]) Labels shape: torch.Size([1, 18469]) Final batch size: 1, sequence length: 17417 Attention mask shape: torch.Size([1, 1, 17417, 17417]) Position ids shape: torch.Size([1, 17417]) Input IDs shape: torch.Size([1, 17417]) Labels shape: torch.Size([1, 17417]) Final batch size: 1, sequence length: 17980 Attention mask shape: torch.Size([1, 1, 17980, 17980]) Position ids shape: torch.Size([1, 17980]) Input IDs shape: torch.Size([1, 17980]) Labels shape: torch.Size([1, 17980]) Final batch size: 1, sequence length: 18454 Attention mask shape: torch.Size([1, 1, 18454, 18454]) Position ids shape: torch.Size([1, 18454]) Input IDs shape: torch.Size([1, 18454]) Labels shape: torch.Size([1, 18454]) Final batch size: 1, sequence length: 19671 Attention mask shape: torch.Size([1, 1, 19671, 19671]) Position ids shape: torch.Size([1, 19671]) Input IDs shape: torch.Size([1, 19671]) Labels shape: torch.Size([1, 19671]) Final batch size: 1, sequence length: 15185 Attention mask shape: torch.Size([1, 1, 15185, 15185]) Position ids shape: torch.Size([1, 15185]) Input IDs shape: torch.Size([1, 15185]) Labels shape: torch.Size([1, 15185]) Final batch size: 1, sequence length: 19683 Attention mask shape: torch.Size([1, 1, 19683, 19683]) Position ids shape: torch.Size([1, 19683]) Input IDs shape: torch.Size([1, 19683]) Labels shape: torch.Size([1, 19683]) Final batch size: 1, sequence length: 20243 Attention mask shape: torch.Size([1, 1, 20243, 20243]) Position ids shape: torch.Size([1, 20243]) Input IDs shape: torch.Size([1, 20243]) Labels shape: torch.Size([1, 20243]) Final batch size: 1, sequence length: 10277 Attention mask shape: torch.Size([1, 1, 10277, 10277]) Position ids shape: torch.Size([1, 10277]) Input IDs shape: torch.Size([1, 10277]) Labels shape: torch.Size([1, 10277]) Final batch size: 1, sequence length: 19187 Attention mask shape: torch.Size([1, 1, 19187, 19187]) Position ids shape: torch.Size([1, 19187]) Input IDs shape: torch.Size([1, 19187]) Labels shape: torch.Size([1, 19187]) Final batch size: 1, sequence length: 20191 Attention mask shape: torch.Size([1, 1, 20191, 20191]) Position ids shape: torch.Size([1, 20191]) Input IDs shape: torch.Size([1, 20191]) Labels shape: torch.Size([1, 20191]) Final batch size: 1, sequence length: 14892 Attention mask shape: torch.Size([1, 1, 14892, 14892]) Position ids shape: torch.Size([1, 14892]) Input IDs shape: torch.Size([1, 14892]) Labels shape: torch.Size([1, 14892]) Final batch size: 1, sequence length: 20562 Attention mask shape: torch.Size([1, 1, 20562, 20562]) Position ids shape: torch.Size([1, 20562]) Input IDs shape: torch.Size([1, 20562]) Labels shape: torch.Size([1, 20562]) Final batch size: 1, sequence length: 15263 Attention mask shape: torch.Size([1, 1, 15263, 15263]) Position ids shape: torch.Size([1, 15263]) Input IDs shape: torch.Size([1, 15263]) Labels shape: torch.Size([1, 15263]) Final batch size: 1, sequence length: 16585 Attention mask shape: torch.Size([1, 1, 16585, 16585]) Position ids shape: torch.Size([1, 16585]) Input IDs shape: torch.Size([1, 16585]) Labels shape: torch.Size([1, 16585]) Final batch size: 1, sequence length: 10182 Attention mask shape: torch.Size([1, 1, 10182, 10182]) Position ids shape: torch.Size([1, 10182]) Input IDs shape: torch.Size([1, 10182]) Labels shape: torch.Size([1, 10182]) Final batch size: 1, sequence length: 15339 Attention mask shape: torch.Size([1, 1, 15339, 15339]) Position ids shape: torch.Size([1, 15339]) Input IDs shape: torch.Size([1, 15339]) Labels shape: torch.Size([1, 15339]) Final batch size: 1, sequence length: 22133 Attention mask shape: torch.Size([1, 1, 22133, 22133]) Position ids shape: torch.Size([1, 22133]) Input IDs shape: torch.Size([1, 22133]) Labels shape: torch.Size([1, 22133]) Final batch size: 1, sequence length: 12012 Attention mask shape: torch.Size([1, 1, 12012, 12012]) Position ids shape: torch.Size([1, 12012]) Input IDs shape: torch.Size([1, 12012]) Labels shape: torch.Size([1, 12012]) Final batch size: 1, sequence length: 20432 Attention mask shape: torch.Size([1, 1, 20432, 20432]) Position ids shape: torch.Size([1, 20432]) Input IDs shape: torch.Size([1, 20432]) Labels shape: torch.Size([1, 20432]) Final batch size: 1, sequence length: 24255 Attention mask shape: torch.Size([1, 1, 24255, 24255]) Position ids shape: torch.Size([1, 24255]) Input IDs shape: torch.Size([1, 24255]) Labels shape: torch.Size([1, 24255]) Final batch size: 1, sequence length: 21051 Attention mask shape: torch.Size([1, 1, 21051, 21051]) Position ids shape: torch.Size([1, 21051]) Input IDs shape: torch.Size([1, 21051]) Labels shape: torch.Size([1, 21051]) Final batch size: 1, sequence length: 23238 Attention mask shape: torch.Size([1, 1, 23238, 23238]) Position ids shape: torch.Size([1, 23238]) Input IDs shape: torch.Size([1, 23238]) Labels shape: torch.Size([1, 23238]) Final batch size: 1, sequence length: 24002 Attention mask shape: torch.Size([1, 1, 24002, 24002]) Position ids shape: torch.Size([1, 24002]) Input IDs shape: torch.Size([1, 24002]) Labels shape: torch.Size([1, 24002]) Final batch size: 1, sequence length: 22857 Attention mask shape: torch.Size([1, 1, 22857, 22857]) Position ids shape: torch.Size([1, 22857]) Input IDs shape: torch.Size([1, 22857]) Labels shape: torch.Size([1, 22857]) Final batch size: 1, sequence length: 24769 Attention mask shape: torch.Size([1, 1, 24769, 24769]) Position ids shape: torch.Size([1, 24769]) Input IDs shape: torch.Size([1, 24769]) Labels shape: torch.Size([1, 24769]) Final batch size: 1, sequence length: 25707 Attention mask shape: torch.Size([1, 1, 25707, 25707]) Position ids shape: torch.Size([1, 25707]) Input IDs shape: torch.Size([1, 25707]) Labels shape: torch.Size([1, 25707]) Final batch size: 1, sequence length: 24428 Attention mask shape: torch.Size([1, 1, 24428, 24428]) Position ids shape: torch.Size([1, 24428]) Input IDs shape: torch.Size([1, 24428]) Labels shape: torch.Size([1, 24428]) Final batch size: 1, sequence length: 14556 Attention mask shape: torch.Size([1, 1, 14556, 14556]) Position ids shape: torch.Size([1, 14556]) Input IDs shape: torch.Size([1, 14556]) Labels shape: torch.Size([1, 14556]) Final batch size: 1, sequence length: 16541 Attention mask shape: torch.Size([1, 1, 16541, 16541]) Position ids shape: torch.Size([1, 16541]) Input IDs shape: torch.Size([1, 16541]) Labels shape: torch.Size([1, 16541]) Final batch size: 1, sequence length: 24858 Attention mask shape: torch.Size([1, 1, 24858, 24858]) Position ids shape: torch.Size([1, 24858]) Input IDs shape: torch.Size([1, 24858]) Labels shape: torch.Size([1, 24858]) Final batch size: 1, sequence length: 26312 Attention mask shape: torch.Size([1, 1, 26312, 26312]) Position ids shape: torch.Size([1, 26312]) Input IDs shape: torch.Size([1, 26312]) Labels shape: torch.Size([1, 26312]) Final batch size: 1, sequence length: 21664 Attention mask shape: torch.Size([1, 1, 21664, 21664]) Position ids shape: torch.Size([1, 21664]) Input IDs shape: torch.Size([1, 21664]) Labels shape: torch.Size([1, 21664]) Final batch size: 1, sequence length: 26144 Attention mask shape: torch.Size([1, 1, 26144, 26144]) Position ids shape: torch.Size([1, 26144]) Input IDs shape: torch.Size([1, 26144]) Labels shape: torch.Size([1, 26144]) Final batch size: 1, sequence length: 26033 Attention mask shape: torch.Size([1, 1, 26033, 26033]) Position ids shape: torch.Size([1, 26033]) Input IDs shape: torch.Size([1, 26033]) Labels shape: torch.Size([1, 26033]) Final batch size: 1, sequence length: 25758 Attention mask shape: torch.Size([1, 1, 25758, 25758]) Position ids shape: torch.Size([1, 25758]) Input IDs shape: torch.Size([1, 25758]) Labels shape: torch.Size([1, 25758]) Final batch size: 1, sequence length: 23602 Attention mask shape: torch.Size([1, 1, 23602, 23602]) Position ids shape: torch.Size([1, 23602]) Input IDs shape: torch.Size([1, 23602]) Labels shape: torch.Size([1, 23602]) Final batch size: 1, sequence length: 25886 Attention mask shape: torch.Size([1, 1, 25886, 25886]) Position ids shape: torch.Size([1, 25886]) Input IDs shape: torch.Size([1, 25886]) Labels shape: torch.Size([1, 25886]) Final batch size: 1, sequence length: 25656 Attention mask shape: torch.Size([1, 1, 25656, 25656]) Position ids shape: torch.Size([1, 25656]) Input IDs shape: torch.Size([1, 25656]) Labels shape: torch.Size([1, 25656]) Final batch size: 1, sequence length: 30079 Attention mask shape: torch.Size([1, 1, 30079, 30079]) Position ids shape: torch.Size([1, 30079]) Input IDs shape: torch.Size([1, 30079]) Labels shape: torch.Size([1, 30079]) Final batch size: 1, sequence length: 10318 Attention mask shape: torch.Size([1, 1, 10318, 10318]) Position ids shape: torch.Size([1, 10318]) Input IDs shape: torch.Size([1, 10318]) Labels shape: torch.Size([1, 10318]) Final batch size: 1, sequence length: 19238 Attention mask shape: torch.Size([1, 1, 19238, 19238]) Position ids shape: torch.Size([1, 19238]) Input IDs shape: torch.Size([1, 19238]) Labels shape: torch.Size([1, 19238]) Final batch size: 1, sequence length: 23698 Attention mask shape: torch.Size([1, 1, 23698, 23698]) Position ids shape: torch.Size([1, 23698]) Input IDs shape: torch.Size([1, 23698]) Labels shape: torch.Size([1, 23698]) Final batch size: 1, sequence length: 29948 Attention mask shape: torch.Size([1, 1, 29948, 29948]) Position ids shape: torch.Size([1, 29948]) Input IDs shape: torch.Size([1, 29948]) Labels shape: torch.Size([1, 29948]) Final batch size: 1, sequence length: 29825 Attention mask shape: torch.Size([1, 1, 29825, 29825]) Position ids shape: torch.Size([1, 29825]) Input IDs shape: torch.Size([1, 29825]) Labels shape: torch.Size([1, 29825]) Final batch size: 1, sequence length: 25719 Attention mask shape: torch.Size([1, 1, 25719, 25719]) Position ids shape: torch.Size([1, 25719]) Input IDs shape: torch.Size([1, 25719]) Labels shape: torch.Size([1, 25719]) Final batch size: 1, sequence length: 15604 Attention mask shape: torch.Size([1, 1, 15604, 15604]) Position ids shape: torch.Size([1, 15604]) Input IDs shape: torch.Size([1, 15604]) Labels shape: torch.Size([1, 15604]) Final batch size: 1, sequence length: 32541 Attention mask shape: torch.Size([1, 1, 32541, 32541]) Position ids shape: torch.Size([1, 32541]) Input IDs shape: torch.Size([1, 32541]) Labels shape: torch.Size([1, 32541]) Final batch size: 1, sequence length: 29749 Attention mask shape: torch.Size([1, 1, 29749, 29749]) Position ids shape: torch.Size([1, 29749]) Input IDs shape: torch.Size([1, 29749]) Labels shape: torch.Size([1, 29749]) Final batch size: 1, sequence length: 32071 Attention mask shape: torch.Size([1, 1, 32071, 32071]) Position ids shape: torch.Size([1, 32071]) Input IDs shape: torch.Size([1, 32071]) Labels shape: torch.Size([1, 32071]) Final batch size: 1, sequence length: 25492 Attention mask shape: torch.Size([1, 1, 25492, 25492]) Position ids shape: torch.Size([1, 25492]) Input IDs shape: torch.Size([1, 25492]) Labels shape: torch.Size([1, 25492]) Final batch size: 1, sequence length: 21374 Attention mask shape: torch.Size([1, 1, 21374, 21374]) Position ids shape: torch.Size([1, 21374]) Input IDs shape: torch.Size([1, 21374]) Labels shape: torch.Size([1, 21374]) Final batch size: 1, sequence length: 22139 Attention mask shape: torch.Size([1, 1, 22139, 22139]) Position ids shape: torch.Size([1, 22139]) Input IDs shape: torch.Size([1, 22139]) Labels shape: torch.Size([1, 22139]) Final batch size: 1, sequence length: 15294 Attention mask shape: torch.Size([1, 1, 15294, 15294]) Position ids shape: torch.Size([1, 15294]) Input IDs shape: torch.Size([1, 15294]) Labels shape: torch.Size([1, 15294]) Final batch size: 1, sequence length: 23399 Attention mask shape: torch.Size([1, 1, 23399, 23399]) Position ids shape: torch.Size([1, 23399]) Input IDs shape: torch.Size([1, 23399]) Labels shape: torch.Size([1, 23399]) Final batch size: 1, sequence length: 33611 Attention mask shape: torch.Size([1, 1, 33611, 33611]) Position ids shape: torch.Size([1, 33611]) Input IDs shape: torch.Size([1, 33611]) Labels shape: torch.Size([1, 33611]) Final batch size: 1, sequence length: 35760 Attention mask shape: torch.Size([1, 1, 35760, 35760]) Position ids shape: torch.Size([1, 35760]) Input IDs shape: torch.Size([1, 35760]) Labels shape: torch.Size([1, 35760]) Final batch size: 1, sequence length: 9897 Final batch size: 1, sequence length: 34701 Attention mask shape: torch.Size([1, 1, 9897, 9897]) Position ids shape: torch.Size([1, 9897]) Input IDs shape: torch.Size([1, 9897]) Labels shape: torch.Size([1, 9897]) Attention mask shape: torch.Size([1, 1, 34701, 34701]) Position ids shape: torch.Size([1, 34701]) Input IDs shape: torch.Size([1, 34701]) Labels shape: torch.Size([1, 34701]) Final batch size: 1, sequence length: 21557 Attention mask shape: torch.Size([1, 1, 21557, 21557]) Position ids shape: torch.Size([1, 21557]) Input IDs shape: torch.Size([1, 21557]) Labels shape: torch.Size([1, 21557]) Final batch size: 1, sequence length: 34186 Attention mask shape: torch.Size([1, 1, 34186, 34186]) Position ids shape: torch.Size([1, 34186]) Input IDs shape: torch.Size([1, 34186]) Labels shape: torch.Size([1, 34186]) Final batch size: 1, sequence length: 36778 Attention mask shape: torch.Size([1, 1, 36778, 36778]) Position ids shape: torch.Size([1, 36778]) Input IDs shape: torch.Size([1, 36778]) Labels shape: torch.Size([1, 36778]) Final batch size: 1, sequence length: 37728 Attention mask shape: torch.Size([1, 1, 37728, 37728]) Position ids shape: torch.Size([1, 37728]) Input IDs shape: torch.Size([1, 37728]) Labels shape: torch.Size([1, 37728]) Final batch size: 1, sequence length: 37241 Attention mask shape: torch.Size([1, 1, 37241, 37241]) Position ids shape: torch.Size([1, 37241]) Input IDs shape: torch.Size([1, 37241]) Labels shape: torch.Size([1, 37241]) Final batch size: 1, sequence length: 33459 Attention mask shape: torch.Size([1, 1, 33459, 33459]) Position ids shape: torch.Size([1, 33459]) Input IDs shape: torch.Size([1, 33459]) Labels shape: torch.Size([1, 33459]) Final batch size: 1, sequence length: 7681 Attention mask shape: torch.Size([1, 1, 7681, 7681]) Position ids shape: torch.Size([1, 7681]) Input IDs shape: torch.Size([1, 7681]) Labels shape: torch.Size([1, 7681]) Final batch size: 1, sequence length: 36456 Attention mask shape: torch.Size([1, 1, 36456, 36456]) Position ids shape: torch.Size([1, 36456]) Input IDs shape: torch.Size([1, 36456]) Labels shape: torch.Size([1, 36456]) Final batch size: 1, sequence length: 40593 Attention mask shape: torch.Size([1, 1, 40593, 40593]) Position ids shape: torch.Size([1, 40593]) Input IDs shape: torch.Size([1, 40593]) Labels shape: torch.Size([1, 40593]) Final batch size: 1, sequence length: 40147 Attention mask shape: torch.Size([1, 1, 40147, 40147]) Position ids shape: torch.Size([1, 40147]) Input IDs shape: torch.Size([1, 40147]) Labels shape: torch.Size([1, 40147]) Final batch size: 1, sequence length: 23343 Attention mask shape: torch.Size([1, 1, 23343, 23343]) Position ids shape: torch.Size([1, 23343]) Input IDs shape: torch.Size([1, 23343]) Labels shape: torch.Size([1, 23343]) Final batch size: 1, sequence length: 31650 Attention mask shape: torch.Size([1, 1, 31650, 31650]) Position ids shape: torch.Size([1, 31650]) Input IDs shape: torch.Size([1, 31650]) Labels shape: torch.Size([1, 31650]) Final batch size: 1, sequence length: 31323 Attention mask shape: torch.Size([1, 1, 31323, 31323]) Position ids shape: torch.Size([1, 31323]) Input IDs shape: torch.Size([1, 31323]) Labels shape: torch.Size([1, 31323]) Final batch size: 1, sequence length: 21955 Attention mask shape: torch.Size([1, 1, 21955, 21955]) Position ids shape: torch.Size([1, 21955]) Input IDs shape: torch.Size([1, 21955]) Labels shape: torch.Size([1, 21955]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 39760 Attention mask shape: torch.Size([1, 1, 39760, 39760]) Position ids shape: torch.Size([1, 39760]) Input IDs shape: torch.Size([1, 39760]) Labels shape: torch.Size([1, 39760]) Final batch size: 1, sequence length: 27283 Attention mask shape: torch.Size([1, 1, 27283, 27283]) Position ids shape: torch.Size([1, 27283]) Input IDs shape: torch.Size([1, 27283]) Labels shape: torch.Size([1, 27283]) Final batch size: 1, sequence length: 9445 Attention mask shape: torch.Size([1, 1, 9445, 9445]) Position ids shape: torch.Size([1, 9445]) Input IDs shape: torch.Size([1, 9445]) Labels shape: torch.Size([1, 9445]) Final batch size: 1, sequence length: 19867 Attention mask shape: torch.Size([1, 1, 19867, 19867]) Position ids shape: torch.Size([1, 19867]) Input IDs shape: torch.Size([1, 19867]) Labels shape: torch.Size([1, 19867]) Final batch size: 1, sequence length: 22434 Attention mask shape: torch.Size([1, 1, 22434, 22434]) Position ids shape: torch.Size([1, 22434]) Input IDs shape: torch.Size([1, 22434]) Labels shape: torch.Size([1, 22434]) Final batch size: 1, sequence length: 22491 Attention mask shape: torch.Size([1, 1, 22491, 22491]) Position ids shape: torch.Size([1, 22491]) Input IDs shape: torch.Size([1, 22491]) Labels shape: torch.Size([1, 22491]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 26664 Attention mask shape: torch.Size([1, 1, 26664, 26664]) Position ids shape: torch.Size([1, 26664]) Input IDs shape: torch.Size([1, 26664]) Labels shape: torch.Size([1, 26664]) Final batch size: 1, sequence length: 40507 Attention mask shape: torch.Size([1, 1, 40507, 40507]) Position ids shape: torch.Size([1, 40507]) Input IDs shape: torch.Size([1, 40507]) Labels shape: torch.Size([1, 40507]) Final batch size: 1, sequence length: 37183 Attention mask shape: torch.Size([1, 1, 37183, 37183]) Position ids shape: torch.Size([1, 37183]) Input IDs shape: torch.Size([1, 37183]) Labels shape: torch.Size([1, 37183]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 17646 Attention mask shape: torch.Size([1, 1, 17646, 17646]) Position ids shape: torch.Size([1, 17646]) Input IDs shape: torch.Size([1, 17646]) Labels shape: torch.Size([1, 17646]) Final batch size: 1, sequence length: 21489 Attention mask shape: torch.Size([1, 1, 21489, 21489]) Position ids shape: torch.Size([1, 21489]) Input IDs shape: torch.Size([1, 21489]) Labels shape: torch.Size([1, 21489]) Final batch size: 1, sequence length: 29082 Attention mask shape: torch.Size([1, 1, 29082, 29082]) Position ids shape: torch.Size([1, 29082]) Input IDs shape: torch.Size([1, 29082]) Labels shape: torch.Size([1, 29082]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 37775 Attention mask shape: torch.Size([1, 1, 37775, 37775]) Position ids shape: torch.Size([1, 37775]) Input IDs shape: torch.Size([1, 37775]) Labels shape: torch.Size([1, 37775]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 23014 Attention mask shape: torch.Size([1, 1, 23014, 23014]) Position ids shape: torch.Size([1, 23014]) Input IDs shape: torch.Size([1, 23014]) Labels shape: torch.Size([1, 23014]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 28661 Attention mask shape: torch.Size([1, 1, 28661, 28661]) Position ids shape: torch.Size([1, 28661]) Input IDs shape: torch.Size([1, 28661]) Labels shape: torch.Size([1, 28661]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 16273 Attention mask shape: torch.Size([1, 1, 16273, 16273]) Position ids shape: torch.Size([1, 16273]) Input IDs shape: torch.Size([1, 16273]) Labels shape: torch.Size([1, 16273]) Final batch size: 1, sequence length: 38686 Attention mask shape: torch.Size([1, 1, 38686, 38686]) Position ids shape: torch.Size([1, 38686]) Input IDs shape: torch.Size([1, 38686]) Labels shape: torch.Size([1, 38686]) Final batch size: 1, sequence length: 36579 Attention mask shape: torch.Size([1, 1, 36579, 36579]) Position ids shape: torch.Size([1, 36579]) Input IDs shape: torch.Size([1, 36579]) Labels shape: torch.Size([1, 36579]) Final batch size: 1, sequence length: 37091 Attention mask shape: torch.Size([1, 1, 37091, 37091]) Position ids shape: torch.Size([1, 37091]) Input IDs shape: torch.Size([1, 37091]) Labels shape: torch.Size([1, 37091]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 33263 Attention mask shape: torch.Size([1, 1, 33263, 33263]) Position ids shape: torch.Size([1, 33263]) Input IDs shape: torch.Size([1, 33263]) Labels shape: torch.Size([1, 33263]) Final batch size: 1, sequence length: 25914 Attention mask shape: torch.Size([1, 1, 25914, 25914]) Position ids shape: torch.Size([1, 25914]) Input IDs shape: torch.Size([1, 25914]) Labels shape: torch.Size([1, 25914]) Final batch size: 1, sequence length: 30364 Attention mask shape: torch.Size([1, 1, 30364, 30364]) Position ids shape: torch.Size([1, 30364]) Input IDs shape: torch.Size([1, 30364]) Labels shape: torch.Size([1, 30364]) Final batch size: 1, sequence length: 36297 Attention mask shape: torch.Size([1, 1, 36297, 36297]) Position ids shape: torch.Size([1, 36297]) Input IDs shape: torch.Size([1, 36297]) Labels shape: torch.Size([1, 36297]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) {'loss': 0.2588, 'grad_norm': 0.28413826831120637, 'learning_rate': 2.7300475013022666e-06, 'num_tokens': -inf, 'epoch': 5.5} Final batch size: 1, sequence length: 6519 Attention mask shape: torch.Size([1, 1, 6519, 6519]) Position ids shape: torch.Size([1, 6519]) Input IDs shape: torch.Size([1, 6519]) Labels shape: torch.Size([1, 6519]) Final batch size: 1, sequence length: 5525 Attention mask shape: torch.Size([1, 1, 5525, 5525]) Position ids shape: torch.Size([1, 5525]) Input IDs shape: torch.Size([1, 5525]) Labels shape: torch.Size([1, 5525]) Final batch size: 1, sequence length: 10273 Attention mask shape: torch.Size([1, 1, 10273, 10273]) Position ids shape: torch.Size([1, 10273]) Input IDs shape: torch.Size([1, 10273]) Labels shape: torch.Size([1, 10273]) Final batch size: 1, sequence length: 9181 Attention mask shape: torch.Size([1, 1, 9181, 9181]) Position ids shape: torch.Size([1, 9181]) Input IDs shape: torch.Size([1, 9181]) Labels shape: torch.Size([1, 9181]) Final batch size: 1, sequence length: 10408 Attention mask shape: torch.Size([1, 1, 10408, 10408]) Position ids shape: torch.Size([1, 10408]) Input IDs shape: torch.Size([1, 10408]) Labels shape: torch.Size([1, 10408]) Final batch size: 1, sequence length: 13385 Attention mask shape: torch.Size([1, 1, 13385, 13385]) Position ids shape: torch.Size([1, 13385]) Input IDs shape: torch.Size([1, 13385]) Labels shape: torch.Size([1, 13385]) Final batch size: 1, sequence length: 12927 Attention mask shape: torch.Size([1, 1, 12927, 12927]) Position ids shape: torch.Size([1, 12927]) Input IDs shape: torch.Size([1, 12927]) Labels shape: torch.Size([1, 12927]) Final batch size: 1, sequence length: 12281 Attention mask shape: torch.Size([1, 1, 12281, 12281]) Position ids shape: torch.Size([1, 12281]) Input IDs shape: torch.Size([1, 12281]) Labels shape: torch.Size([1, 12281]) Final batch size: 1, sequence length: 13804 Attention mask shape: torch.Size([1, 1, 13804, 13804]) Position ids shape: torch.Size([1, 13804]) Input IDs shape: torch.Size([1, 13804]) Labels shape: torch.Size([1, 13804]) Final batch size: 1, sequence length: 15257 Attention mask shape: torch.Size([1, 1, 15257, 15257]) Position ids shape: torch.Size([1, 15257]) Input IDs shape: torch.Size([1, 15257]) Labels shape: torch.Size([1, 15257]) Final batch size: 1, sequence length: 13363 Attention mask shape: torch.Size([1, 1, 13363, 13363]) Position ids shape: torch.Size([1, 13363]) Input IDs shape: torch.Size([1, 13363]) Labels shape: torch.Size([1, 13363]) Final batch size: 1, sequence length: 15518 Attention mask shape: torch.Size([1, 1, 15518, 15518]) Position ids shape: torch.Size([1, 15518]) Input IDs shape: torch.Size([1, 15518]) Labels shape: torch.Size([1, 15518]) Final batch size: 1, sequence length: 10905 Attention mask shape: torch.Size([1, 1, 10905, 10905]) Position ids shape: torch.Size([1, 10905]) Input IDs shape: torch.Size([1, 10905]) Labels shape: torch.Size([1, 10905]) Final batch size: 1, sequence length: 17003 Attention mask shape: torch.Size([1, 1, 17003, 17003]) Position ids shape: torch.Size([1, 17003]) Input IDs shape: torch.Size([1, 17003]) Labels shape: torch.Size([1, 17003]) Final batch size: 1, sequence length: 15244 Attention mask shape: torch.Size([1, 1, 15244, 15244]) Position ids shape: torch.Size([1, 15244]) Input IDs shape: torch.Size([1, 15244]) Labels shape: torch.Size([1, 15244]) Final batch size: 1, sequence length: 16520 Attention mask shape: torch.Size([1, 1, 16520, 16520]) Position ids shape: torch.Size([1, 16520]) Input IDs shape: torch.Size([1, 16520]) Labels shape: torch.Size([1, 16520]) Final batch size: 1, sequence length: 8646 Attention mask shape: torch.Size([1, 1, 8646, 8646]) Position ids shape: torch.Size([1, 8646]) Input IDs shape: torch.Size([1, 8646]) Labels shape: torch.Size([1, 8646]) Final batch size: 1, sequence length: 20089 Attention mask shape: torch.Size([1, 1, 20089, 20089]) Position ids shape: torch.Size([1, 20089]) Input IDs shape: torch.Size([1, 20089]) Labels shape: torch.Size([1, 20089]) Final batch size: 1, sequence length: 19286 Attention mask shape: torch.Size([1, 1, 19286, 19286]) Position ids shape: torch.Size([1, 19286]) Input IDs shape: torch.Size([1, 19286]) Labels shape: torch.Size([1, 19286]) Final batch size: 1, sequence length: 18645 Attention mask shape: torch.Size([1, 1, 18645, 18645]) Position ids shape: torch.Size([1, 18645]) Input IDs shape: torch.Size([1, 18645]) Labels shape: torch.Size([1, 18645]) Final batch size: 1, sequence length: 19768 Attention mask shape: torch.Size([1, 1, 19768, 19768]) Position ids shape: torch.Size([1, 19768]) Input IDs shape: torch.Size([1, 19768]) Labels shape: torch.Size([1, 19768]) Final batch size: 1, sequence length: 14429 Attention mask shape: torch.Size([1, 1, 14429, 14429]) Position ids shape: torch.Size([1, 14429]) Input IDs shape: torch.Size([1, 14429]) Labels shape: torch.Size([1, 14429]) Final batch size: 1, sequence length: 19513 Attention mask shape: torch.Size([1, 1, 19513, 19513]) Position ids shape: torch.Size([1, 19513]) Input IDs shape: torch.Size([1, 19513]) Labels shape: torch.Size([1, 19513]) Final batch size: 1, sequence length: 5405 Attention mask shape: torch.Size([1, 1, 5405, 5405]) Position ids shape: torch.Size([1, 5405]) Input IDs shape: torch.Size([1, 5405]) Labels shape: torch.Size([1, 5405]) Final batch size: 1, sequence length: 22777 Attention mask shape: torch.Size([1, 1, 22777, 22777]) Position ids shape: torch.Size([1, 22777]) Input IDs shape: torch.Size([1, 22777]) Labels shape: torch.Size([1, 22777]) Final batch size: 1, sequence length: 20979 Attention mask shape: torch.Size([1, 1, 20979, 20979]) Position ids shape: torch.Size([1, 20979]) Input IDs shape: torch.Size([1, 20979]) Labels shape: torch.Size([1, 20979]) Final batch size: 1, sequence length: 22915 Attention mask shape: torch.Size([1, 1, 22915, 22915]) Position ids shape: torch.Size([1, 22915]) Input IDs shape: torch.Size([1, 22915]) Labels shape: torch.Size([1, 22915]) Final batch size: 1, sequence length: 23942 Attention mask shape: torch.Size([1, 1, 23942, 23942]) Position ids shape: torch.Size([1, 23942]) Input IDs shape: torch.Size([1, 23942]) Labels shape: torch.Size([1, 23942]) Final batch size: 1, sequence length: 25001 Attention mask shape: torch.Size([1, 1, 25001, 25001]) Position ids shape: torch.Size([1, 25001]) Input IDs shape: torch.Size([1, 25001]) Labels shape: torch.Size([1, 25001]) Final batch size: 1, sequence length: 18819 Attention mask shape: torch.Size([1, 1, 18819, 18819]) Position ids shape: torch.Size([1, 18819]) Input IDs shape: torch.Size([1, 18819]) Labels shape: torch.Size([1, 18819]) Final batch size: 1, sequence length: 20106 Attention mask shape: torch.Size([1, 1, 20106, 20106]) Position ids shape: torch.Size([1, 20106]) Input IDs shape: torch.Size([1, 20106]) Labels shape: torch.Size([1, 20106]) Final batch size: 1, sequence length: 18988 Attention mask shape: torch.Size([1, 1, 18988, 18988]) Position ids shape: torch.Size([1, 18988]) Input IDs shape: torch.Size([1, 18988]) Labels shape: torch.Size([1, 18988]) Final batch size: 1, sequence length: 22160 Attention mask shape: torch.Size([1, 1, 22160, 22160]) Position ids shape: torch.Size([1, 22160]) Input IDs shape: torch.Size([1, 22160]) Labels shape: torch.Size([1, 22160]) Final batch size: 1, sequence length: 23886 Attention mask shape: torch.Size([1, 1, 23886, 23886]) Position ids shape: torch.Size([1, 23886]) Input IDs shape: torch.Size([1, 23886]) Labels shape: torch.Size([1, 23886]) Final batch size: 1, sequence length: 25021 Attention mask shape: torch.Size([1, 1, 25021, 25021]) Position ids shape: torch.Size([1, 25021]) Input IDs shape: torch.Size([1, 25021]) Labels shape: torch.Size([1, 25021]) Final batch size: 1, sequence length: 22979 Attention mask shape: torch.Size([1, 1, 22979, 22979]) Position ids shape: torch.Size([1, 22979]) Input IDs shape: torch.Size([1, 22979]) Labels shape: torch.Size([1, 22979]) Final batch size: 1, sequence length: 23995 Attention mask shape: torch.Size([1, 1, 23995, 23995]) Position ids shape: torch.Size([1, 23995]) Input IDs shape: torch.Size([1, 23995]) Labels shape: torch.Size([1, 23995]) Final batch size: 1, sequence length: 23724 Attention mask shape: torch.Size([1, 1, 23724, 23724]) Position ids shape: torch.Size([1, 23724]) Input IDs shape: torch.Size([1, 23724]) Labels shape: torch.Size([1, 23724]) Final batch size: 1, sequence length: 21660 Attention mask shape: torch.Size([1, 1, 21660, 21660]) Position ids shape: torch.Size([1, 21660]) Input IDs shape: torch.Size([1, 21660]) Labels shape: torch.Size([1, 21660]) Final batch size: 1, sequence length: 23341 Attention mask shape: torch.Size([1, 1, 23341, 23341]) Position ids shape: torch.Size([1, 23341]) Input IDs shape: torch.Size([1, 23341]) Labels shape: torch.Size([1, 23341]) Final batch size: 1, sequence length: 22311 Attention mask shape: torch.Size([1, 1, 22311, 22311]) Position ids shape: torch.Size([1, 22311]) Input IDs shape: torch.Size([1, 22311]) Labels shape: torch.Size([1, 22311]) Final batch size: 1, sequence length: 5288 Attention mask shape: torch.Size([1, 1, 5288, 5288]) Position ids shape: torch.Size([1, 5288]) Input IDs shape: torch.Size([1, 5288]) Labels shape: torch.Size([1, 5288]) Final batch size: 1, sequence length: 24287 Attention mask shape: torch.Size([1, 1, 24287, 24287]) Position ids shape: torch.Size([1, 24287]) Input IDs shape: torch.Size([1, 24287]) Labels shape: torch.Size([1, 24287]) Final batch size: 1, sequence length: 24293 Attention mask shape: torch.Size([1, 1, 24293, 24293]) Position ids shape: torch.Size([1, 24293]) Input IDs shape: torch.Size([1, 24293]) Labels shape: torch.Size([1, 24293]) Final batch size: 1, sequence length: 24407 Attention mask shape: torch.Size([1, 1, 24407, 24407]) Position ids shape: torch.Size([1, 24407]) Input IDs shape: torch.Size([1, 24407]) Labels shape: torch.Size([1, 24407]) Final batch size: 1, sequence length: 26461 Attention mask shape: torch.Size([1, 1, 26461, 26461]) Position ids shape: torch.Size([1, 26461]) Input IDs shape: torch.Size([1, 26461]) Labels shape: torch.Size([1, 26461]) Final batch size: 1, sequence length: 28263 Attention mask shape: torch.Size([1, 1, 28263, 28263]) Position ids shape: torch.Size([1, 28263]) Input IDs shape: torch.Size([1, 28263]) Labels shape: torch.Size([1, 28263]) Final batch size: 1, sequence length: 26886 Attention mask shape: torch.Size([1, 1, 26886, 26886]) Position ids shape: torch.Size([1, 26886]) Input IDs shape: torch.Size([1, 26886]) Labels shape: torch.Size([1, 26886]) Final batch size: 1, sequence length: 11266 Attention mask shape: torch.Size([1, 1, 11266, 11266]) Position ids shape: torch.Size([1, 11266]) Input IDs shape: torch.Size([1, 11266]) Labels shape: torch.Size([1, 11266]) Final batch size: 1, sequence length: 29109 Attention mask shape: torch.Size([1, 1, 29109, 29109]) Position ids shape: torch.Size([1, 29109]) Input IDs shape: torch.Size([1, 29109]) Labels shape: torch.Size([1, 29109]) Final batch size: 1, sequence length: 12483 Attention mask shape: torch.Size([1, 1, 12483, 12483]) Position ids shape: torch.Size([1, 12483]) Input IDs shape: torch.Size([1, 12483]) Labels shape: torch.Size([1, 12483]) Final batch size: 1, sequence length: 26179 Attention mask shape: torch.Size([1, 1, 26179, 26179]) Position ids shape: torch.Size([1, 26179]) Input IDs shape: torch.Size([1, 26179]) Labels shape: torch.Size([1, 26179]) Final batch size: 1, sequence length: 29009 Attention mask shape: torch.Size([1, 1, 29009, 29009]) Position ids shape: torch.Size([1, 29009]) Input IDs shape: torch.Size([1, 29009]) Labels shape: torch.Size([1, 29009]) Final batch size: 1, sequence length: 29152 Attention mask shape: torch.Size([1, 1, 29152, 29152]) Position ids shape: torch.Size([1, 29152]) Input IDs shape: torch.Size([1, 29152]) Labels shape: torch.Size([1, 29152]) Final batch size: 1, sequence length: 30723 Attention mask shape: torch.Size([1, 1, 30723, 30723]) Position ids shape: torch.Size([1, 30723]) Input IDs shape: torch.Size([1, 30723]) Labels shape: torch.Size([1, 30723]) Final batch size: 1, sequence length: 26054 Attention mask shape: torch.Size([1, 1, 26054, 26054]) Position ids shape: torch.Size([1, 26054]) Input IDs shape: torch.Size([1, 26054]) Labels shape: torch.Size([1, 26054]) Final batch size: 1, sequence length: 32083 Attention mask shape: torch.Size([1, 1, 32083, 32083]) Position ids shape: torch.Size([1, 32083]) Input IDs shape: torch.Size([1, 32083]) Labels shape: torch.Size([1, 32083]) Final batch size: 1, sequence length: 20941 Attention mask shape: torch.Size([1, 1, 20941, 20941]) Position ids shape: torch.Size([1, 20941]) Input IDs shape: torch.Size([1, 20941]) Labels shape: torch.Size([1, 20941]) Final batch size: 1, sequence length: 28440 Attention mask shape: torch.Size([1, 1, 28440, 28440]) Position ids shape: torch.Size([1, 28440]) Input IDs shape: torch.Size([1, 28440]) Labels shape: torch.Size([1, 28440]) Final batch size: 1, sequence length: 30236 Attention mask shape: torch.Size([1, 1, 30236, 30236]) Position ids shape: torch.Size([1, 30236]) Input IDs shape: torch.Size([1, 30236]) Labels shape: torch.Size([1, 30236]) Final batch size: 1, sequence length: 13600 Attention mask shape: torch.Size([1, 1, 13600, 13600]) Position ids shape: torch.Size([1, 13600]) Input IDs shape: torch.Size([1, 13600]) Labels shape: torch.Size([1, 13600]) Final batch size: 1, sequence length: 27405 Attention mask shape: torch.Size([1, 1, 27405, 27405]) Position ids shape: torch.Size([1, 27405]) Input IDs shape: torch.Size([1, 27405]) Labels shape: torch.Size([1, 27405]) Final batch size: 1, sequence length: 17951 Attention mask shape: torch.Size([1, 1, 17951, 17951]) Position ids shape: torch.Size([1, 17951]) Input IDs shape: torch.Size([1, 17951]) Labels shape: torch.Size([1, 17951]) Final batch size: 1, sequence length: 32799 Attention mask shape: torch.Size([1, 1, 32799, 32799]) Position ids shape: torch.Size([1, 32799]) Input IDs shape: torch.Size([1, 32799]) Labels shape: torch.Size([1, 32799]) Final batch size: 1, sequence length: 32660 Attention mask shape: torch.Size([1, 1, 32660, 32660]) Position ids shape: torch.Size([1, 32660]) Input IDs shape: torch.Size([1, 32660]) Labels shape: torch.Size([1, 32660]) Final batch size: 1, sequence length: 17649 Attention mask shape: torch.Size([1, 1, 17649, 17649]) Position ids shape: torch.Size([1, 17649]) Input IDs shape: torch.Size([1, 17649]) Labels shape: torch.Size([1, 17649]) Final batch size: 1, sequence length: 29561 Attention mask shape: torch.Size([1, 1, 29561, 29561]) Position ids shape: torch.Size([1, 29561]) Input IDs shape: torch.Size([1, 29561]) Labels shape: torch.Size([1, 29561]) Final batch size: 1, sequence length: 15830 Attention mask shape: torch.Size([1, 1, 15830, 15830]) Position ids shape: torch.Size([1, 15830]) Input IDs shape: torch.Size([1, 15830]) Labels shape: torch.Size([1, 15830]) Final batch size: 1, sequence length: 17539 Attention mask shape: torch.Size([1, 1, 17539, 17539]) Position ids shape: torch.Size([1, 17539]) Input IDs shape: torch.Size([1, 17539]) Labels shape: torch.Size([1, 17539]) Final batch size: 1, sequence length: 21472 Attention mask shape: torch.Size([1, 1, 21472, 21472]) Position ids shape: torch.Size([1, 21472]) Input IDs shape: torch.Size([1, 21472]) Labels shape: torch.Size([1, 21472]) Final batch size: 1, sequence length: 32287 Attention mask shape: torch.Size([1, 1, 32287, 32287]) Position ids shape: torch.Size([1, 32287]) Input IDs shape: torch.Size([1, 32287]) Labels shape: torch.Size([1, 32287]) Final batch size: 1, sequence length: 29237 Attention mask shape: torch.Size([1, 1, 29237, 29237]) Position ids shape: torch.Size([1, 29237]) Input IDs shape: torch.Size([1, 29237]) Labels shape: torch.Size([1, 29237]) Final batch size: 1, sequence length: 15906 Attention mask shape: torch.Size([1, 1, 15906, 15906]) Position ids shape: torch.Size([1, 15906]) Input IDs shape: torch.Size([1, 15906]) Labels shape: torch.Size([1, 15906]) Final batch size: 1, sequence length: 34694 Attention mask shape: torch.Size([1, 1, 34694, 34694]) Position ids shape: torch.Size([1, 34694]) Input IDs shape: torch.Size([1, 34694]) Labels shape: torch.Size([1, 34694]) Final batch size: 1, sequence length: 31512 Final batch size: 1, sequence length: 26452 Attention mask shape: torch.Size([1, 1, 31512, 31512]) Attention mask shape: torch.Size([1, 1, 26452, 26452]) Position ids shape: torch.Size([1, 31512]) Position ids shape: torch.Size([1, 26452]) Input IDs shape: torch.Size([1, 31512]) Input IDs shape: torch.Size([1, 26452]) Labels shape: torch.Size([1, 31512]) Labels shape: torch.Size([1, 26452]) Final batch size: 1, sequence length: 24056 Attention mask shape: torch.Size([1, 1, 24056, 24056]) Position ids shape: torch.Size([1, 24056]) Input IDs shape: torch.Size([1, 24056]) Labels shape: torch.Size([1, 24056]) Final batch size: 1, sequence length: 17775 Attention mask shape: torch.Size([1, 1, 17775, 17775]) Position ids shape: torch.Size([1, 17775]) Input IDs shape: torch.Size([1, 17775]) Labels shape: torch.Size([1, 17775]) Final batch size: 1, sequence length: 8008 Attention mask shape: torch.Size([1, 1, 8008, 8008]) Position ids shape: torch.Size([1, 8008]) Input IDs shape: torch.Size([1, 8008]) Labels shape: torch.Size([1, 8008]) Final batch size: 1, sequence length: 16631 Attention mask shape: torch.Size([1, 1, 16631, 16631]) Position ids shape: torch.Size([1, 16631]) Input IDs shape: torch.Size([1, 16631]) Labels shape: torch.Size([1, 16631]) Final batch size: 1, sequence length: 29880 Attention mask shape: torch.Size([1, 1, 29880, 29880]) Position ids shape: torch.Size([1, 29880]) Input IDs shape: torch.Size([1, 29880]) Labels shape: torch.Size([1, 29880]) Final batch size: 1, sequence length: 27987 Attention mask shape: torch.Size([1, 1, 27987, 27987]) Position ids shape: torch.Size([1, 27987]) Input IDs shape: torch.Size([1, 27987]) Labels shape: torch.Size([1, 27987]) Final batch size: 1, sequence length: 23945 Attention mask shape: torch.Size([1, 1, 23945, 23945]) Position ids shape: torch.Size([1, 23945]) Input IDs shape: torch.Size([1, 23945]) Labels shape: torch.Size([1, 23945]) Final batch size: 1, sequence length: 35696 Attention mask shape: torch.Size([1, 1, 35696, 35696]) Position ids shape: torch.Size([1, 35696]) Input IDs shape: torch.Size([1, 35696]) Labels shape: torch.Size([1, 35696]) Final batch size: 1, sequence length: 35478 Attention mask shape: torch.Size([1, 1, 35478, 35478]) Position ids shape: torch.Size([1, 35478]) Input IDs shape: torch.Size([1, 35478]) Labels shape: torch.Size([1, 35478]) Final batch size: 1, sequence length: 34777 Attention mask shape: torch.Size([1, 1, 34777, 34777]) Position ids shape: torch.Size([1, 34777]) Input IDs shape: torch.Size([1, 34777]) Labels shape: torch.Size([1, 34777]) Final batch size: 1, sequence length: 7448 Attention mask shape: torch.Size([1, 1, 7448, 7448]) Position ids shape: torch.Size([1, 7448]) Input IDs shape: torch.Size([1, 7448]) Labels shape: torch.Size([1, 7448]) Final batch size: 1, sequence length: 16852 Attention mask shape: torch.Size([1, 1, 16852, 16852]) Position ids shape: torch.Size([1, 16852]) Input IDs shape: torch.Size([1, 16852]) Labels shape: torch.Size([1, 16852]) Final batch size: 1, sequence length: 29262 Attention mask shape: torch.Size([1, 1, 29262, 29262]) Position ids shape: torch.Size([1, 29262]) Input IDs shape: torch.Size([1, 29262]) Labels shape: torch.Size([1, 29262]) Final batch size: 1, sequence length: 28879 Attention mask shape: torch.Size([1, 1, 28879, 28879]) Position ids shape: torch.Size([1, 28879]) Input IDs shape: torch.Size([1, 28879]) Labels shape: torch.Size([1, 28879]) Final batch size: 1, sequence length: 31786 Attention mask shape: torch.Size([1, 1, 31786, 31786]) Position ids shape: torch.Size([1, 31786]) Input IDs shape: torch.Size([1, 31786]) Labels shape: torch.Size([1, 31786]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 28691 Attention mask shape: torch.Size([1, 1, 28691, 28691]) Position ids shape: torch.Size([1, 28691]) Input IDs shape: torch.Size([1, 28691]) Labels shape: torch.Size([1, 28691]) Final batch size: 1, sequence length: 36314 Attention mask shape: torch.Size([1, 1, 36314, 36314]) Position ids shape: torch.Size([1, 36314]) Input IDs shape: torch.Size([1, 36314]) Labels shape: torch.Size([1, 36314]) Final batch size: 1, sequence length: 24824 Attention mask shape: torch.Size([1, 1, 24824, 24824]) Position ids shape: torch.Size([1, 24824]) Input IDs shape: torch.Size([1, 24824]) Labels shape: torch.Size([1, 24824]) Final batch size: 1, sequence length: 35999 Attention mask shape: torch.Size([1, 1, 35999, 35999]) Position ids shape: torch.Size([1, 35999]) Input IDs shape: torch.Size([1, 35999]) Labels shape: torch.Size([1, 35999]) Final batch size: 1, sequence length: 37933 Attention mask shape: torch.Size([1, 1, 37933, 37933]) Position ids shape: torch.Size([1, 37933]) Input IDs shape: torch.Size([1, 37933]) Labels shape: torch.Size([1, 37933]) Final batch size: 1, sequence length: 39476 Attention mask shape: torch.Size([1, 1, 39476, 39476]) Position ids shape: torch.Size([1, 39476]) Input IDs shape: torch.Size([1, 39476]) Labels shape: torch.Size([1, 39476]) Final batch size: 1, sequence length: 10480 Attention mask shape: torch.Size([1, 1, 10480, 10480]) Position ids shape: torch.Size([1, 10480]) Input IDs shape: torch.Size([1, 10480]) Labels shape: torch.Size([1, 10480]) Final batch size: 1, sequence length: 20694 Attention mask shape: torch.Size([1, 1, 20694, 20694]) Position ids shape: torch.Size([1, 20694]) Input IDs shape: torch.Size([1, 20694]) Labels shape: torch.Size([1, 20694]) Final batch size: 1, sequence length: 35526 Attention mask shape: torch.Size([1, 1, 35526, 35526]) Position ids shape: torch.Size([1, 35526]) Input IDs shape: torch.Size([1, 35526]) Labels shape: torch.Size([1, 35526]) Final batch size: 1, sequence length: 31132 Attention mask shape: torch.Size([1, 1, 31132, 31132]) Position ids shape: torch.Size([1, 31132]) Input IDs shape: torch.Size([1, 31132]) Labels shape: torch.Size([1, 31132]) Final batch size: 1, sequence length: 6378 Attention mask shape: torch.Size([1, 1, 6378, 6378]) Position ids shape: torch.Size([1, 6378]) Input IDs shape: torch.Size([1, 6378]) Labels shape: torch.Size([1, 6378]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 38935 Attention mask shape: torch.Size([1, 1, 38935, 38935]) Position ids shape: torch.Size([1, 38935]) Input IDs shape: torch.Size([1, 38935]) Labels shape: torch.Size([1, 38935]) Final batch size: 1, sequence length: 28495 Attention mask shape: torch.Size([1, 1, 28495, 28495]) Position ids shape: torch.Size([1, 28495]) Input IDs shape: torch.Size([1, 28495]) Labels shape: torch.Size([1, 28495]) Final batch size: 1, sequence length: 11395 Attention mask shape: torch.Size([1, 1, 11395, 11395]) Position ids shape: torch.Size([1, 11395]) Input IDs shape: torch.Size([1, 11395]) Labels shape: torch.Size([1, 11395]) Final batch size: 1, sequence length: 20176 Attention mask shape: torch.Size([1, 1, 20176, 20176]) Position ids shape: torch.Size([1, 20176]) Input IDs shape: torch.Size([1, 20176]) Labels shape: torch.Size([1, 20176]) Final batch size: 1, sequence length: 14869 Attention mask shape: torch.Size([1, 1, 14869, 14869]) Position ids shape: torch.Size([1, 14869]) Input IDs shape: torch.Size([1, 14869]) Labels shape: torch.Size([1, 14869]) Final batch size: 1, sequence length: 35116 Attention mask shape: torch.Size([1, 1, 35116, 35116]) Position ids shape: torch.Size([1, 35116]) Input IDs shape: torch.Size([1, 35116]) Labels shape: torch.Size([1, 35116]) Final batch size: 1, sequence length: 18911 Attention mask shape: torch.Size([1, 1, 18911, 18911]) Position ids shape: torch.Size([1, 18911]) Input IDs shape: torch.Size([1, 18911]) Labels shape: torch.Size([1, 18911]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 37159 Attention mask shape: torch.Size([1, 1, 37159, 37159]) Position ids shape: torch.Size([1, 37159]) Input IDs shape: torch.Size([1, 37159]) Labels shape: torch.Size([1, 37159]) Final batch size: 1, sequence length: 18127 Attention mask shape: torch.Size([1, 1, 18127, 18127]) Position ids shape: torch.Size([1, 18127]) Input IDs shape: torch.Size([1, 18127]) Labels shape: torch.Size([1, 18127]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40605 Attention mask shape: torch.Size([1, 1, 40605, 40605]) Position ids shape: torch.Size([1, 40605]) Input IDs shape: torch.Size([1, 40605]) Labels shape: torch.Size([1, 40605]) Final batch size: 1, sequence length: 30024 Attention mask shape: torch.Size([1, 1, 30024, 30024]) Position ids shape: torch.Size([1, 30024]) Input IDs shape: torch.Size([1, 30024]) Labels shape: torch.Size([1, 30024]) Final batch size: 1, sequence length: 36469 Attention mask shape: torch.Size([1, 1, 36469, 36469]) Position ids shape: torch.Size([1, 36469]) Input IDs shape: torch.Size([1, 36469]) Labels shape: torch.Size([1, 36469]) Final batch size: 1, sequence length: 28313 Attention mask shape: torch.Size([1, 1, 28313, 28313]) Position ids shape: torch.Size([1, 28313]) Input IDs shape: torch.Size([1, 28313]) Labels shape: torch.Size([1, 28313]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 39687 Attention mask shape: torch.Size([1, 1, 39687, 39687]) Position ids shape: torch.Size([1, 39687]) Input IDs shape: torch.Size([1, 39687]) Labels shape: torch.Size([1, 39687]) Final batch size: 1, sequence length: 39324 Attention mask shape: torch.Size([1, 1, 39324, 39324]) Position ids shape: torch.Size([1, 39324]) Input IDs shape: torch.Size([1, 39324]) Labels shape: torch.Size([1, 39324]) Final batch size: 1, sequence length: 30245 Attention mask shape: torch.Size([1, 1, 30245, 30245]) Position ids shape: torch.Size([1, 30245]) Input IDs shape: torch.Size([1, 30245]) Labels shape: torch.Size([1, 30245]) Final batch size: 1, sequence length: 33070 Attention mask shape: torch.Size([1, 1, 33070, 33070]) Position ids shape: torch.Size([1, 33070]) Input IDs shape: torch.Size([1, 33070]) Labels shape: torch.Size([1, 33070]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 26402 Attention mask shape: torch.Size([1, 1, 26402, 26402]) Position ids shape: torch.Size([1, 26402]) Input IDs shape: torch.Size([1, 26402]) Labels shape: torch.Size([1, 26402]) {'loss': 0.2422, 'grad_norm': 0.28429630540389345, 'learning_rate': 2.5000000000000015e-06, 'num_tokens': -inf, 'epoch': 5.62} Final batch size: 1, sequence length: 3968 Attention mask shape: torch.Size([1, 1, 3968, 3968]) Position ids shape: torch.Size([1, 3968]) Input IDs shape: torch.Size([1, 3968]) Labels shape: torch.Size([1, 3968]) Final batch size: 1, sequence length: 8347 Attention mask shape: torch.Size([1, 1, 8347, 8347]) Position ids shape: torch.Size([1, 8347]) Input IDs shape: torch.Size([1, 8347]) Labels shape: torch.Size([1, 8347]) Final batch size: 1, sequence length: 5997 Attention mask shape: torch.Size([1, 1, 5997, 5997]) Position ids shape: torch.Size([1, 5997]) Input IDs shape: torch.Size([1, 5997]) Labels shape: torch.Size([1, 5997]) Final batch size: 1, sequence length: 9027 Attention mask shape: torch.Size([1, 1, 9027, 9027]) Position ids shape: torch.Size([1, 9027]) Input IDs shape: torch.Size([1, 9027]) Labels shape: torch.Size([1, 9027]) Final batch size: 1, sequence length: 6567 Attention mask shape: torch.Size([1, 1, 6567, 6567]) Position ids shape: torch.Size([1, 6567]) Input IDs shape: torch.Size([1, 6567]) Labels shape: torch.Size([1, 6567]) Final batch size: 1, sequence length: 11054 Attention mask shape: torch.Size([1, 1, 11054, 11054]) Position ids shape: torch.Size([1, 11054]) Input IDs shape: torch.Size([1, 11054]) Labels shape: torch.Size([1, 11054]) Final batch size: 1, sequence length: 12967 Attention mask shape: torch.Size([1, 1, 12967, 12967]) Position ids shape: torch.Size([1, 12967]) Input IDs shape: torch.Size([1, 12967]) Labels shape: torch.Size([1, 12967]) Final batch size: 1, sequence length: 11232 Attention mask shape: torch.Size([1, 1, 11232, 11232]) Position ids shape: torch.Size([1, 11232]) Input IDs shape: torch.Size([1, 11232]) Labels shape: torch.Size([1, 11232]) Final batch size: 1, sequence length: 14758 Attention mask shape: torch.Size([1, 1, 14758, 14758]) Position ids shape: torch.Size([1, 14758]) Input IDs shape: torch.Size([1, 14758]) Labels shape: torch.Size([1, 14758]) Final batch size: 1, sequence length: 13260 Attention mask shape: torch.Size([1, 1, 13260, 13260]) Position ids shape: torch.Size([1, 13260]) Input IDs shape: torch.Size([1, 13260]) Labels shape: torch.Size([1, 13260]) Final batch size: 1, sequence length: 11880 Attention mask shape: torch.Size([1, 1, 11880, 11880]) Position ids shape: torch.Size([1, 11880]) Input IDs shape: torch.Size([1, 11880]) Labels shape: torch.Size([1, 11880]) Final batch size: 1, sequence length: 11752 Attention mask shape: torch.Size([1, 1, 11752, 11752]) Position ids shape: torch.Size([1, 11752]) Input IDs shape: torch.Size([1, 11752]) Labels shape: torch.Size([1, 11752]) Final batch size: 1, sequence length: 12370 Attention mask shape: torch.Size([1, 1, 12370, 12370]) Position ids shape: torch.Size([1, 12370]) Input IDs shape: torch.Size([1, 12370]) Labels shape: torch.Size([1, 12370]) Final batch size: 1, sequence length: 14512 Attention mask shape: torch.Size([1, 1, 14512, 14512]) Position ids shape: torch.Size([1, 14512]) Input IDs shape: torch.Size([1, 14512]) Labels shape: torch.Size([1, 14512]) Final batch size: 1, sequence length: 14597 Attention mask shape: torch.Size([1, 1, 14597, 14597]) Position ids shape: torch.Size([1, 14597]) Input IDs shape: torch.Size([1, 14597]) Labels shape: torch.Size([1, 14597]) Final batch size: 1, sequence length: 16014 Attention mask shape: torch.Size([1, 1, 16014, 16014]) Position ids shape: torch.Size([1, 16014]) Input IDs shape: torch.Size([1, 16014]) Labels shape: torch.Size([1, 16014]) Final batch size: 1, sequence length: 17166 Attention mask shape: torch.Size([1, 1, 17166, 17166]) Position ids shape: torch.Size([1, 17166]) Input IDs shape: torch.Size([1, 17166]) Labels shape: torch.Size([1, 17166]) Final batch size: 1, sequence length: 15438 Attention mask shape: torch.Size([1, 1, 15438, 15438]) Position ids shape: torch.Size([1, 15438]) Input IDs shape: torch.Size([1, 15438]) Labels shape: torch.Size([1, 15438]) Final batch size: 1, sequence length: 18050 Attention mask shape: torch.Size([1, 1, 18050, 18050]) Position ids shape: torch.Size([1, 18050]) Input IDs shape: torch.Size([1, 18050]) Labels shape: torch.Size([1, 18050]) Final batch size: 1, sequence length: 17092 Attention mask shape: torch.Size([1, 1, 17092, 17092]) Position ids shape: torch.Size([1, 17092]) Input IDs shape: torch.Size([1, 17092]) Labels shape: torch.Size([1, 17092]) Final batch size: 1, sequence length: 18131 Attention mask shape: torch.Size([1, 1, 18131, 18131]) Position ids shape: torch.Size([1, 18131]) Input IDs shape: torch.Size([1, 18131]) Labels shape: torch.Size([1, 18131]) Final batch size: 1, sequence length: 17839 Attention mask shape: torch.Size([1, 1, 17839, 17839]) Position ids shape: torch.Size([1, 17839]) Input IDs shape: torch.Size([1, 17839]) Labels shape: torch.Size([1, 17839]) Final batch size: 1, sequence length: 18264 Attention mask shape: torch.Size([1, 1, 18264, 18264]) Position ids shape: torch.Size([1, 18264]) Input IDs shape: torch.Size([1, 18264]) Labels shape: torch.Size([1, 18264]) Final batch size: 1, sequence length: 11206 Attention mask shape: torch.Size([1, 1, 11206, 11206]) Position ids shape: torch.Size([1, 11206]) Input IDs shape: torch.Size([1, 11206]) Labels shape: torch.Size([1, 11206]) Final batch size: 1, sequence length: 20487 Attention mask shape: torch.Size([1, 1, 20487, 20487]) Position ids shape: torch.Size([1, 20487]) Input IDs shape: torch.Size([1, 20487]) Labels shape: torch.Size([1, 20487]) Final batch size: 1, sequence length: 14520 Attention mask shape: torch.Size([1, 1, 14520, 14520]) Position ids shape: torch.Size([1, 14520]) Input IDs shape: torch.Size([1, 14520]) Labels shape: torch.Size([1, 14520]) Final batch size: 1, sequence length: 20947 Attention mask shape: torch.Size([1, 1, 20947, 20947]) Position ids shape: torch.Size([1, 20947]) Input IDs shape: torch.Size([1, 20947]) Labels shape: torch.Size([1, 20947]) Final batch size: 1, sequence length: 14335 Attention mask shape: torch.Size([1, 1, 14335, 14335]) Position ids shape: torch.Size([1, 14335]) Input IDs shape: torch.Size([1, 14335]) Labels shape: torch.Size([1, 14335]) Final batch size: 1, sequence length: 22107 Attention mask shape: torch.Size([1, 1, 22107, 22107]) Position ids shape: torch.Size([1, 22107]) Input IDs shape: torch.Size([1, 22107]) Labels shape: torch.Size([1, 22107]) Final batch size: 1, sequence length: 13061 Attention mask shape: torch.Size([1, 1, 13061, 13061]) Position ids shape: torch.Size([1, 13061]) Input IDs shape: torch.Size([1, 13061]) Labels shape: torch.Size([1, 13061]) Final batch size: 1, sequence length: 18950 Attention mask shape: torch.Size([1, 1, 18950, 18950]) Position ids shape: torch.Size([1, 18950]) Input IDs shape: torch.Size([1, 18950]) Labels shape: torch.Size([1, 18950]) Final batch size: 1, sequence length: 24566 Attention mask shape: torch.Size([1, 1, 24566, 24566]) Position ids shape: torch.Size([1, 24566]) Input IDs shape: torch.Size([1, 24566]) Labels shape: torch.Size([1, 24566]) Final batch size: 1, sequence length: 17194 Attention mask shape: torch.Size([1, 1, 17194, 17194]) Position ids shape: torch.Size([1, 17194]) Input IDs shape: torch.Size([1, 17194]) Labels shape: torch.Size([1, 17194]) Final batch size: 1, sequence length: 21137 Attention mask shape: torch.Size([1, 1, 21137, 21137]) Position ids shape: torch.Size([1, 21137]) Input IDs shape: torch.Size([1, 21137]) Labels shape: torch.Size([1, 21137]) Final batch size: 1, sequence length: 23324 Attention mask shape: torch.Size([1, 1, 23324, 23324]) Position ids shape: torch.Size([1, 23324]) Input IDs shape: torch.Size([1, 23324]) Labels shape: torch.Size([1, 23324]) Final batch size: 1, sequence length: 17115 Attention mask shape: torch.Size([1, 1, 17115, 17115]) Position ids shape: torch.Size([1, 17115]) Input IDs shape: torch.Size([1, 17115]) Labels shape: torch.Size([1, 17115]) Final batch size: 1, sequence length: 26068 Attention mask shape: torch.Size([1, 1, 26068, 26068]) Position ids shape: torch.Size([1, 26068]) Input IDs shape: torch.Size([1, 26068]) Labels shape: torch.Size([1, 26068]) Final batch size: 1, sequence length: 22547 Attention mask shape: torch.Size([1, 1, 22547, 22547]) Position ids shape: torch.Size([1, 22547]) Input IDs shape: torch.Size([1, 22547]) Labels shape: torch.Size([1, 22547]) Final batch size: 1, sequence length: 24432 Attention mask shape: torch.Size([1, 1, 24432, 24432]) Position ids shape: torch.Size([1, 24432]) Input IDs shape: torch.Size([1, 24432]) Labels shape: torch.Size([1, 24432]) Final batch size: 1, sequence length: 15229 Attention mask shape: torch.Size([1, 1, 15229, 15229]) Position ids shape: torch.Size([1, 15229]) Input IDs shape: torch.Size([1, 15229]) Labels shape: torch.Size([1, 15229]) Final batch size: 1, sequence length: 23960 Attention mask shape: torch.Size([1, 1, 23960, 23960]) Position ids shape: torch.Size([1, 23960]) Input IDs shape: torch.Size([1, 23960]) Labels shape: torch.Size([1, 23960]) Final batch size: 1, sequence length: 10286 Attention mask shape: torch.Size([1, 1, 10286, 10286]) Position ids shape: torch.Size([1, 10286]) Input IDs shape: torch.Size([1, 10286]) Labels shape: torch.Size([1, 10286]) Final batch size: 1, sequence length: 28284 Attention mask shape: torch.Size([1, 1, 28284, 28284]) Position ids shape: torch.Size([1, 28284]) Input IDs shape: torch.Size([1, 28284]) Labels shape: torch.Size([1, 28284]) Final batch size: 1, sequence length: 18051 Attention mask shape: torch.Size([1, 1, 18051, 18051]) Position ids shape: torch.Size([1, 18051]) Input IDs shape: torch.Size([1, 18051]) Labels shape: torch.Size([1, 18051]) Final batch size: 1, sequence length: 29236 Attention mask shape: torch.Size([1, 1, 29236, 29236]) Position ids shape: torch.Size([1, 29236]) Input IDs shape: torch.Size([1, 29236]) Labels shape: torch.Size([1, 29236]) Final batch size: 1, sequence length: 28858 Attention mask shape: torch.Size([1, 1, 28858, 28858]) Position ids shape: torch.Size([1, 28858]) Input IDs shape: torch.Size([1, 28858]) Labels shape: torch.Size([1, 28858]) Final batch size: 1, sequence length: 26271 Attention mask shape: torch.Size([1, 1, 26271, 26271]) Position ids shape: torch.Size([1, 26271]) Input IDs shape: torch.Size([1, 26271]) Labels shape: torch.Size([1, 26271]) Final batch size: 1, sequence length: 26333 Attention mask shape: torch.Size([1, 1, 26333, 26333]) Position ids shape: torch.Size([1, 26333]) Input IDs shape: torch.Size([1, 26333]) Labels shape: torch.Size([1, 26333]) Final batch size: 1, sequence length: 21728 Attention mask shape: torch.Size([1, 1, 21728, 21728]) Position ids shape: torch.Size([1, 21728]) Input IDs shape: torch.Size([1, 21728]) Labels shape: torch.Size([1, 21728]) Final batch size: 1, sequence length: 29592 Attention mask shape: torch.Size([1, 1, 29592, 29592]) Position ids shape: torch.Size([1, 29592]) Input IDs shape: torch.Size([1, 29592]) Labels shape: torch.Size([1, 29592]) Final batch size: 1, sequence length: 29478 Attention mask shape: torch.Size([1, 1, 29478, 29478]) Position ids shape: torch.Size([1, 29478]) Input IDs shape: torch.Size([1, 29478]) Labels shape: torch.Size([1, 29478]) Final batch size: 1, sequence length: 26937 Attention mask shape: torch.Size([1, 1, 26937, 26937]) Position ids shape: torch.Size([1, 26937]) Input IDs shape: torch.Size([1, 26937]) Labels shape: torch.Size([1, 26937]) Final batch size: 1, sequence length: 31377 Attention mask shape: torch.Size([1, 1, 31377, 31377]) Position ids shape: torch.Size([1, 31377]) Input IDs shape: torch.Size([1, 31377]) Labels shape: torch.Size([1, 31377]) Final batch size: 1, sequence length: 25252 Attention mask shape: torch.Size([1, 1, 25252, 25252]) Position ids shape: torch.Size([1, 25252]) Input IDs shape: torch.Size([1, 25252]) Labels shape: torch.Size([1, 25252]) Final batch size: 1, sequence length: 28421 Attention mask shape: torch.Size([1, 1, 28421, 28421]) Position ids shape: torch.Size([1, 28421]) Input IDs shape: torch.Size([1, 28421]) Labels shape: torch.Size([1, 28421]) Final batch size: 1, sequence length: 29768 Attention mask shape: torch.Size([1, 1, 29768, 29768]) Position ids shape: torch.Size([1, 29768]) Input IDs shape: torch.Size([1, 29768]) Labels shape: torch.Size([1, 29768]) Final batch size: 1, sequence length: 27293 Attention mask shape: torch.Size([1, 1, 27293, 27293]) Position ids shape: torch.Size([1, 27293]) Input IDs shape: torch.Size([1, 27293]) Labels shape: torch.Size([1, 27293]) Final batch size: 1, sequence length: 30862 Attention mask shape: torch.Size([1, 1, 30862, 30862]) Position ids shape: torch.Size([1, 30862]) Input IDs shape: torch.Size([1, 30862]) Labels shape: torch.Size([1, 30862]) Final batch size: 1, sequence length: 31860 Attention mask shape: torch.Size([1, 1, 31860, 31860]) Position ids shape: torch.Size([1, 31860]) Input IDs shape: torch.Size([1, 31860]) Labels shape: torch.Size([1, 31860]) Final batch size: 1, sequence length: 18408 Attention mask shape: torch.Size([1, 1, 18408, 18408]) Position ids shape: torch.Size([1, 18408]) Input IDs shape: torch.Size([1, 18408]) Labels shape: torch.Size([1, 18408]) Final batch size: 1, sequence length: 32392 Attention mask shape: torch.Size([1, 1, 32392, 32392]) Position ids shape: torch.Size([1, 32392]) Input IDs shape: torch.Size([1, 32392]) Labels shape: torch.Size([1, 32392]) Final batch size: 1, sequence length: 27278 Attention mask shape: torch.Size([1, 1, 27278, 27278]) Position ids shape: torch.Size([1, 27278]) Input IDs shape: torch.Size([1, 27278]) Labels shape: torch.Size([1, 27278]) Final batch size: 1, sequence length: 7364 Attention mask shape: torch.Size([1, 1, 7364, 7364]) Position ids shape: torch.Size([1, 7364]) Input IDs shape: torch.Size([1, 7364]) Labels shape: torch.Size([1, 7364]) Final batch size: 1, sequence length: 33736 Attention mask shape: torch.Size([1, 1, 33736, 33736]) Position ids shape: torch.Size([1, 33736]) Input IDs shape: torch.Size([1, 33736]) Labels shape: torch.Size([1, 33736]) Final batch size: 1, sequence length: 19888 Attention mask shape: torch.Size([1, 1, 19888, 19888]) Position ids shape: torch.Size([1, 19888]) Input IDs shape: torch.Size([1, 19888]) Labels shape: torch.Size([1, 19888]) Final batch size: 1, sequence length: 13257 Attention mask shape: torch.Size([1, 1, 13257, 13257]) Position ids shape: torch.Size([1, 13257]) Input IDs shape: torch.Size([1, 13257]) Labels shape: torch.Size([1, 13257]) Final batch size: 1, sequence length: 31740 Attention mask shape: torch.Size([1, 1, 31740, 31740]) Position ids shape: torch.Size([1, 31740]) Input IDs shape: torch.Size([1, 31740]) Labels shape: torch.Size([1, 31740]) Final batch size: 1, sequence length: 30302 Attention mask shape: torch.Size([1, 1, 30302, 30302]) Position ids shape: torch.Size([1, 30302]) Input IDs shape: torch.Size([1, 30302]) Labels shape: torch.Size([1, 30302]) Final batch size: 1, sequence length: 32112 Attention mask shape: torch.Size([1, 1, 32112, 32112]) Position ids shape: torch.Size([1, 32112]) Input IDs shape: torch.Size([1, 32112]) Labels shape: torch.Size([1, 32112]) Final batch size: 1, sequence length: 33097 Attention mask shape: torch.Size([1, 1, 33097, 33097]) Position ids shape: torch.Size([1, 33097]) Input IDs shape: torch.Size([1, 33097]) Labels shape: torch.Size([1, 33097]) Final batch size: 1, sequence length: 9974 Attention mask shape: torch.Size([1, 1, 9974, 9974]) Position ids shape: torch.Size([1, 9974]) Input IDs shape: torch.Size([1, 9974]) Labels shape: torch.Size([1, 9974]) Final batch size: 1, sequence length: 32181 Attention mask shape: torch.Size([1, 1, 32181, 32181]) Position ids shape: torch.Size([1, 32181]) Input IDs shape: torch.Size([1, 32181]) Labels shape: torch.Size([1, 32181]) Final batch size: 1, sequence length: 35596 Attention mask shape: torch.Size([1, 1, 35596, 35596]) Position ids shape: torch.Size([1, 35596]) Input IDs shape: torch.Size([1, 35596]) Labels shape: torch.Size([1, 35596]) Final batch size: 1, sequence length: 29489 Attention mask shape: torch.Size([1, 1, 29489, 29489]) Position ids shape: torch.Size([1, 29489]) Input IDs shape: torch.Size([1, 29489]) Labels shape: torch.Size([1, 29489]) Final batch size: 1, sequence length: 32162 Attention mask shape: torch.Size([1, 1, 32162, 32162]) Position ids shape: torch.Size([1, 32162]) Input IDs shape: torch.Size([1, 32162]) Labels shape: torch.Size([1, 32162]) Final batch size: 1, sequence length: 25484 Attention mask shape: torch.Size([1, 1, 25484, 25484]) Position ids shape: torch.Size([1, 25484]) Input IDs shape: torch.Size([1, 25484]) Labels shape: torch.Size([1, 25484]) Final batch size: 1, sequence length: 32979 Attention mask shape: torch.Size([1, 1, 32979, 32979]) Position ids shape: torch.Size([1, 32979]) Input IDs shape: torch.Size([1, 32979]) Labels shape: torch.Size([1, 32979]) Final batch size: 1, sequence length: 37623 Attention mask shape: torch.Size([1, 1, 37623, 37623]) Position ids shape: torch.Size([1, 37623]) Input IDs shape: torch.Size([1, 37623]) Labels shape: torch.Size([1, 37623]) Final batch size: 1, sequence length: 32327 Attention mask shape: torch.Size([1, 1, 32327, 32327]) Position ids shape: torch.Size([1, 32327]) Input IDs shape: torch.Size([1, 32327]) Labels shape: torch.Size([1, 32327]) Final batch size: 1, sequence length: 35069 Attention mask shape: torch.Size([1, 1, 35069, 35069]) Position ids shape: torch.Size([1, 35069]) Input IDs shape: torch.Size([1, 35069]) Labels shape: torch.Size([1, 35069]) Final batch size: 1, sequence length: 35171 Attention mask shape: torch.Size([1, 1, 35171, 35171]) Position ids shape: torch.Size([1, 35171]) Input IDs shape: torch.Size([1, 35171]) Labels shape: torch.Size([1, 35171]) Final batch size: 1, sequence length: 34514 Attention mask shape: torch.Size([1, 1, 34514, 34514]) Position ids shape: torch.Size([1, 34514]) Input IDs shape: torch.Size([1, 34514]) Labels shape: torch.Size([1, 34514]) Final batch size: 1, sequence length: 34871 Attention mask shape: torch.Size([1, 1, 34871, 34871]) Position ids shape: torch.Size([1, 34871]) Input IDs shape: torch.Size([1, 34871]) Labels shape: torch.Size([1, 34871]) Final batch size: 1, sequence length: 21500 Attention mask shape: torch.Size([1, 1, 21500, 21500]) Position ids shape: torch.Size([1, 21500]) Input IDs shape: torch.Size([1, 21500]) Labels shape: torch.Size([1, 21500]) Final batch size: 1, sequence length: 38790 Attention mask shape: torch.Size([1, 1, 38790, 38790]) Position ids shape: torch.Size([1, 38790]) Input IDs shape: torch.Size([1, 38790]) Labels shape: torch.Size([1, 38790]) Final batch size: 1, sequence length: 30051 Attention mask shape: torch.Size([1, 1, 30051, 30051]) Position ids shape: torch.Size([1, 30051]) Input IDs shape: torch.Size([1, 30051]) Labels shape: torch.Size([1, 30051]) Final batch size: 1, sequence length: 24061 Attention mask shape: torch.Size([1, 1, 24061, 24061]) Position ids shape: torch.Size([1, 24061]) Input IDs shape: torch.Size([1, 24061]) Labels shape: torch.Size([1, 24061]) Final batch size: 1, sequence length: 39754 Attention mask shape: torch.Size([1, 1, 39754, 39754]) Position ids shape: torch.Size([1, 39754]) Input IDs shape: torch.Size([1, 39754]) Labels shape: torch.Size([1, 39754]) Final batch size: 1, sequence length: 21408 Attention mask shape: torch.Size([1, 1, 21408, 21408]) Position ids shape: torch.Size([1, 21408]) Input IDs shape: torch.Size([1, 21408]) Labels shape: torch.Size([1, 21408]) Final batch size: 1, sequence length: 25386 Attention mask shape: torch.Size([1, 1, 25386, 25386]) Position ids shape: torch.Size([1, 25386]) Input IDs shape: torch.Size([1, 25386]) Labels shape: torch.Size([1, 25386]) Final batch size: 1, sequence length: 17777 Attention mask shape: torch.Size([1, 1, 17777, 17777]) Position ids shape: torch.Size([1, 17777]) Input IDs shape: torch.Size([1, 17777]) Labels shape: torch.Size([1, 17777]) Final batch size: 1, sequence length: 13557 Attention mask shape: torch.Size([1, 1, 13557, 13557]) Position ids shape: torch.Size([1, 13557]) Input IDs shape: torch.Size([1, 13557]) Labels shape: torch.Size([1, 13557]) Final batch size: 1, sequence length: 38351 Attention mask shape: torch.Size([1, 1, 38351, 38351]) Position ids shape: torch.Size([1, 38351]) Input IDs shape: torch.Size([1, 38351]) Labels shape: torch.Size([1, 38351]) Final batch size: 1, sequence length: 30639 Attention mask shape: torch.Size([1, 1, 30639, 30639]) Position ids shape: torch.Size([1, 30639]) Input IDs shape: torch.Size([1, 30639]) Labels shape: torch.Size([1, 30639]) Final batch size: 1, sequence length: 31679 Attention mask shape: torch.Size([1, 1, 31679, 31679]) Position ids shape: torch.Size([1, 31679]) Input IDs shape: torch.Size([1, 31679]) Labels shape: torch.Size([1, 31679]) Final batch size: 1, sequence length: 36535 Attention mask shape: torch.Size([1, 1, 36535, 36535]) Position ids shape: torch.Size([1, 36535]) Input IDs shape: torch.Size([1, 36535]) Labels shape: torch.Size([1, 36535]) Final batch size: 1, sequence length: 34170 Attention mask shape: torch.Size([1, 1, 34170, 34170]) Position ids shape: torch.Size([1, 34170]) Input IDs shape: torch.Size([1, 34170]) Labels shape: torch.Size([1, 34170]) Final batch size: 1, sequence length: 18711 Attention mask shape: torch.Size([1, 1, 18711, 18711]) Position ids shape: torch.Size([1, 18711]) Input IDs shape: torch.Size([1, 18711]) Labels shape: torch.Size([1, 18711]) Final batch size: 1, sequence length: 24943 Attention mask shape: torch.Size([1, 1, 24943, 24943]) Position ids shape: torch.Size([1, 24943]) Input IDs shape: torch.Size([1, 24943]) Labels shape: torch.Size([1, 24943]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 15697 Attention mask shape: torch.Size([1, 1, 15697, 15697]) Position ids shape: torch.Size([1, 15697]) Input IDs shape: torch.Size([1, 15697]) Labels shape: torch.Size([1, 15697]) Final batch size: 1, sequence length: 21825 Attention mask shape: torch.Size([1, 1, 21825, 21825]) Position ids shape: torch.Size([1, 21825]) Input IDs shape: torch.Size([1, 21825]) Labels shape: torch.Size([1, 21825]) Final batch size: 1, sequence length: 33807 Attention mask shape: torch.Size([1, 1, 33807, 33807]) Position ids shape: torch.Size([1, 33807]) Input IDs shape: torch.Size([1, 33807]) Labels shape: torch.Size([1, 33807]) Final batch size: 1, sequence length: 24910 Attention mask shape: torch.Size([1, 1, 24910, 24910]) Position ids shape: torch.Size([1, 24910]) Input IDs shape: torch.Size([1, 24910]) Labels shape: torch.Size([1, 24910]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 24840 Attention mask shape: torch.Size([1, 1, 24840, 24840]) Position ids shape: torch.Size([1, 24840]) Input IDs shape: torch.Size([1, 24840]) Labels shape: torch.Size([1, 24840]) Final batch size: 1, sequence length: 18106 Attention mask shape: torch.Size([1, 1, 18106, 18106]) Position ids shape: torch.Size([1, 18106]) Input IDs shape: torch.Size([1, 18106]) Labels shape: torch.Size([1, 18106]) Final batch size: 1, sequence length: 31513 Attention mask shape: torch.Size([1, 1, 31513, 31513]) Position ids shape: torch.Size([1, 31513]) Input IDs shape: torch.Size([1, 31513]) Labels shape: torch.Size([1, 31513]) Final batch size: 1, sequence length: 33883 Attention mask shape: torch.Size([1, 1, 33883, 33883]) Position ids shape: torch.Size([1, 33883]) Input IDs shape: torch.Size([1, 33883]) Labels shape: torch.Size([1, 33883]) Final batch size: 1, sequence length: 23243 Attention mask shape: torch.Size([1, 1, 23243, 23243]) Position ids shape: torch.Size([1, 23243]) Input IDs shape: torch.Size([1, 23243]) Labels shape: torch.Size([1, 23243]) Final batch size: 1, sequence length: 30361 Attention mask shape: torch.Size([1, 1, 30361, 30361]) Position ids shape: torch.Size([1, 30361]) Input IDs shape: torch.Size([1, 30361]) Labels shape: torch.Size([1, 30361]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 27228 Attention mask shape: torch.Size([1, 1, 27228, 27228]) Position ids shape: torch.Size([1, 27228]) Input IDs shape: torch.Size([1, 27228]) Labels shape: torch.Size([1, 27228]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 15180 Attention mask shape: torch.Size([1, 1, 15180, 15180]) Position ids shape: torch.Size([1, 15180]) Input IDs shape: torch.Size([1, 15180]) Labels shape: torch.Size([1, 15180]) Final batch size: 1, sequence length: 30419 Attention mask shape: torch.Size([1, 1, 30419, 30419]) Position ids shape: torch.Size([1, 30419]) Input IDs shape: torch.Size([1, 30419]) Labels shape: torch.Size([1, 30419]) Final batch size: 1, sequence length: 29691 Attention mask shape: torch.Size([1, 1, 29691, 29691]) Position ids shape: torch.Size([1, 29691]) Input IDs shape: torch.Size([1, 29691]) Labels shape: torch.Size([1, 29691]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 32351 Attention mask shape: torch.Size([1, 1, 32351, 32351]) Position ids shape: torch.Size([1, 32351]) Input IDs shape: torch.Size([1, 32351]) Labels shape: torch.Size([1, 32351]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 39483 Attention mask shape: torch.Size([1, 1, 39483, 39483]) Position ids shape: torch.Size([1, 39483]) Input IDs shape: torch.Size([1, 39483]) Labels shape: torch.Size([1, 39483]) Final batch size: 1, sequence length: 33549 Attention mask shape: torch.Size([1, 1, 33549, 33549]) Position ids shape: torch.Size([1, 33549]) Input IDs shape: torch.Size([1, 33549]) Labels shape: torch.Size([1, 33549]) Final batch size: 1, sequence length: 13064 Attention mask shape: torch.Size([1, 1, 13064, 13064]) Position ids shape: torch.Size([1, 13064]) Input IDs shape: torch.Size([1, 13064]) Labels shape: torch.Size([1, 13064]) Final batch size: 1, sequence length: 36532 Attention mask shape: torch.Size([1, 1, 36532, 36532]) Position ids shape: torch.Size([1, 36532]) Input IDs shape: torch.Size([1, 36532]) Labels shape: torch.Size([1, 36532]) {'loss': 0.2544, 'grad_norm': 0.2380081956314407, 'learning_rate': 2.2768048249248648e-06, 'num_tokens': -inf, 'epoch': 5.75} Final batch size: 1, sequence length: 8845 Attention mask shape: torch.Size([1, 1, 8845, 8845]) Position ids shape: torch.Size([1, 8845]) Input IDs shape: torch.Size([1, 8845]) Labels shape: torch.Size([1, 8845]) Final batch size: 1, sequence length: 7235 Attention mask shape: torch.Size([1, 1, 7235, 7235]) Position ids shape: torch.Size([1, 7235]) Input IDs shape: torch.Size([1, 7235]) Labels shape: torch.Size([1, 7235]) Final batch size: 1, sequence length: 12215 Attention mask shape: torch.Size([1, 1, 12215, 12215]) Position ids shape: torch.Size([1, 12215]) Input IDs shape: torch.Size([1, 12215]) Labels shape: torch.Size([1, 12215]) Final batch size: 1, sequence length: 12830 Attention mask shape: torch.Size([1, 1, 12830, 12830]) Position ids shape: torch.Size([1, 12830]) Input IDs shape: torch.Size([1, 12830]) Labels shape: torch.Size([1, 12830]) Final batch size: 1, sequence length: 13665 Attention mask shape: torch.Size([1, 1, 13665, 13665]) Position ids shape: torch.Size([1, 13665]) Input IDs shape: torch.Size([1, 13665]) Labels shape: torch.Size([1, 13665]) Final batch size: 1, sequence length: 13575 Attention mask shape: torch.Size([1, 1, 13575, 13575]) Position ids shape: torch.Size([1, 13575]) Input IDs shape: torch.Size([1, 13575]) Labels shape: torch.Size([1, 13575]) Final batch size: 1, sequence length: 12562 Attention mask shape: torch.Size([1, 1, 12562, 12562]) Position ids shape: torch.Size([1, 12562]) Input IDs shape: torch.Size([1, 12562]) Labels shape: torch.Size([1, 12562]) Final batch size: 1, sequence length: 12960 Attention mask shape: torch.Size([1, 1, 12960, 12960]) Position ids shape: torch.Size([1, 12960]) Input IDs shape: torch.Size([1, 12960]) Labels shape: torch.Size([1, 12960]) Final batch size: 1, sequence length: 14689 Attention mask shape: torch.Size([1, 1, 14689, 14689]) Position ids shape: torch.Size([1, 14689]) Input IDs shape: torch.Size([1, 14689]) Labels shape: torch.Size([1, 14689]) Final batch size: 1, sequence length: 15673 Attention mask shape: torch.Size([1, 1, 15673, 15673]) Position ids shape: torch.Size([1, 15673]) Input IDs shape: torch.Size([1, 15673]) Labels shape: torch.Size([1, 15673]) Final batch size: 1, sequence length: 17016 Attention mask shape: torch.Size([1, 1, 17016, 17016]) Position ids shape: torch.Size([1, 17016]) Input IDs shape: torch.Size([1, 17016]) Labels shape: torch.Size([1, 17016]) Final batch size: 1, sequence length: 14833 Attention mask shape: torch.Size([1, 1, 14833, 14833]) Position ids shape: torch.Size([1, 14833]) Input IDs shape: torch.Size([1, 14833]) Labels shape: torch.Size([1, 14833]) Final batch size: 1, sequence length: 16145 Attention mask shape: torch.Size([1, 1, 16145, 16145]) Position ids shape: torch.Size([1, 16145]) Input IDs shape: torch.Size([1, 16145]) Labels shape: torch.Size([1, 16145]) Final batch size: 1, sequence length: 17026 Attention mask shape: torch.Size([1, 1, 17026, 17026]) Position ids shape: torch.Size([1, 17026]) Input IDs shape: torch.Size([1, 17026]) Labels shape: torch.Size([1, 17026]) Final batch size: 1, sequence length: 14482 Attention mask shape: torch.Size([1, 1, 14482, 14482]) Position ids shape: torch.Size([1, 14482]) Input IDs shape: torch.Size([1, 14482]) Labels shape: torch.Size([1, 14482]) Final batch size: 1, sequence length: 12622 Attention mask shape: torch.Size([1, 1, 12622, 12622]) Position ids shape: torch.Size([1, 12622]) Input IDs shape: torch.Size([1, 12622]) Labels shape: torch.Size([1, 12622]) Final batch size: 1, sequence length: 15816 Attention mask shape: torch.Size([1, 1, 15816, 15816]) Position ids shape: torch.Size([1, 15816]) Input IDs shape: torch.Size([1, 15816]) Labels shape: torch.Size([1, 15816]) Final batch size: 1, sequence length: 18978 Attention mask shape: torch.Size([1, 1, 18978, 18978]) Position ids shape: torch.Size([1, 18978]) Input IDs shape: torch.Size([1, 18978]) Labels shape: torch.Size([1, 18978]) Final batch size: 1, sequence length: 17988 Attention mask shape: torch.Size([1, 1, 17988, 17988]) Position ids shape: torch.Size([1, 17988]) Input IDs shape: torch.Size([1, 17988]) Labels shape: torch.Size([1, 17988]) Final batch size: 1, sequence length: 17512 Attention mask shape: torch.Size([1, 1, 17512, 17512]) Position ids shape: torch.Size([1, 17512]) Input IDs shape: torch.Size([1, 17512]) Labels shape: torch.Size([1, 17512]) Final batch size: 1, sequence length: 19409 Attention mask shape: torch.Size([1, 1, 19409, 19409]) Position ids shape: torch.Size([1, 19409]) Input IDs shape: torch.Size([1, 19409]) Labels shape: torch.Size([1, 19409]) Final batch size: 1, sequence length: 19225 Attention mask shape: torch.Size([1, 1, 19225, 19225]) Position ids shape: torch.Size([1, 19225]) Input IDs shape: torch.Size([1, 19225]) Labels shape: torch.Size([1, 19225]) Final batch size: 1, sequence length: 21075 Attention mask shape: torch.Size([1, 1, 21075, 21075]) Position ids shape: torch.Size([1, 21075]) Input IDs shape: torch.Size([1, 21075]) Labels shape: torch.Size([1, 21075]) Final batch size: 1, sequence length: 10494 Attention mask shape: torch.Size([1, 1, 10494, 10494]) Position ids shape: torch.Size([1, 10494]) Input IDs shape: torch.Size([1, 10494]) Labels shape: torch.Size([1, 10494]) Final batch size: 1, sequence length: 14577 Attention mask shape: torch.Size([1, 1, 14577, 14577]) Position ids shape: torch.Size([1, 14577]) Input IDs shape: torch.Size([1, 14577]) Labels shape: torch.Size([1, 14577]) Final batch size: 1, sequence length: 15222 Attention mask shape: torch.Size([1, 1, 15222, 15222]) Position ids shape: torch.Size([1, 15222]) Input IDs shape: torch.Size([1, 15222]) Labels shape: torch.Size([1, 15222]) Final batch size: 1, sequence length: 12969 Attention mask shape: torch.Size([1, 1, 12969, 12969]) Position ids shape: torch.Size([1, 12969]) Input IDs shape: torch.Size([1, 12969]) Labels shape: torch.Size([1, 12969]) Final batch size: 1, sequence length: 20784 Attention mask shape: torch.Size([1, 1, 20784, 20784]) Position ids shape: torch.Size([1, 20784]) Input IDs shape: torch.Size([1, 20784]) Labels shape: torch.Size([1, 20784]) Final batch size: 1, sequence length: 20585 Attention mask shape: torch.Size([1, 1, 20585, 20585]) Position ids shape: torch.Size([1, 20585]) Input IDs shape: torch.Size([1, 20585]) Labels shape: torch.Size([1, 20585]) Final batch size: 1, sequence length: 22949 Attention mask shape: torch.Size([1, 1, 22949, 22949]) Position ids shape: torch.Size([1, 22949]) Input IDs shape: torch.Size([1, 22949]) Labels shape: torch.Size([1, 22949]) Final batch size: 1, sequence length: 18922 Attention mask shape: torch.Size([1, 1, 18922, 18922]) Position ids shape: torch.Size([1, 18922]) Input IDs shape: torch.Size([1, 18922]) Labels shape: torch.Size([1, 18922]) Final batch size: 1, sequence length: 6618 Attention mask shape: torch.Size([1, 1, 6618, 6618]) Position ids shape: torch.Size([1, 6618]) Input IDs shape: torch.Size([1, 6618]) Labels shape: torch.Size([1, 6618]) Final batch size: 1, sequence length: 21598 Attention mask shape: torch.Size([1, 1, 21598, 21598]) Position ids shape: torch.Size([1, 21598]) Input IDs shape: torch.Size([1, 21598]) Labels shape: torch.Size([1, 21598]) Final batch size: 1, sequence length: 14548 Attention mask shape: torch.Size([1, 1, 14548, 14548]) Position ids shape: torch.Size([1, 14548]) Input IDs shape: torch.Size([1, 14548]) Labels shape: torch.Size([1, 14548]) Final batch size: 1, sequence length: 20991 Attention mask shape: torch.Size([1, 1, 20991, 20991]) Position ids shape: torch.Size([1, 20991]) Input IDs shape: torch.Size([1, 20991]) Labels shape: torch.Size([1, 20991]) Final batch size: 1, sequence length: 23755 Attention mask shape: torch.Size([1, 1, 23755, 23755]) Position ids shape: torch.Size([1, 23755]) Input IDs shape: torch.Size([1, 23755]) Labels shape: torch.Size([1, 23755]) Final batch size: 1, sequence length: 20465 Attention mask shape: torch.Size([1, 1, 20465, 20465]) Position ids shape: torch.Size([1, 20465]) Input IDs shape: torch.Size([1, 20465]) Labels shape: torch.Size([1, 20465]) Final batch size: 1, sequence length: 20389 Attention mask shape: torch.Size([1, 1, 20389, 20389]) Position ids shape: torch.Size([1, 20389]) Input IDs shape: torch.Size([1, 20389]) Labels shape: torch.Size([1, 20389]) Final batch size: 1, sequence length: 23293 Attention mask shape: torch.Size([1, 1, 23293, 23293]) Position ids shape: torch.Size([1, 23293]) Input IDs shape: torch.Size([1, 23293]) Labels shape: torch.Size([1, 23293]) Final batch size: 1, sequence length: 20630 Attention mask shape: torch.Size([1, 1, 20630, 20630]) Position ids shape: torch.Size([1, 20630]) Input IDs shape: torch.Size([1, 20630]) Labels shape: torch.Size([1, 20630]) Final batch size: 1, sequence length: 23805 Attention mask shape: torch.Size([1, 1, 23805, 23805]) Position ids shape: torch.Size([1, 23805]) Input IDs shape: torch.Size([1, 23805]) Labels shape: torch.Size([1, 23805]) Final batch size: 1, sequence length: 13072 Attention mask shape: torch.Size([1, 1, 13072, 13072]) Position ids shape: torch.Size([1, 13072]) Input IDs shape: torch.Size([1, 13072]) Labels shape: torch.Size([1, 13072]) Final batch size: 1, sequence length: 19325 Attention mask shape: torch.Size([1, 1, 19325, 19325]) Position ids shape: torch.Size([1, 19325]) Input IDs shape: torch.Size([1, 19325]) Labels shape: torch.Size([1, 19325]) Final batch size: 1, sequence length: 21322 Attention mask shape: torch.Size([1, 1, 21322, 21322]) Position ids shape: torch.Size([1, 21322]) Input IDs shape: torch.Size([1, 21322]) Labels shape: torch.Size([1, 21322]) Final batch size: 1, sequence length: 23437 Attention mask shape: torch.Size([1, 1, 23437, 23437]) Position ids shape: torch.Size([1, 23437]) Input IDs shape: torch.Size([1, 23437]) Labels shape: torch.Size([1, 23437]) Final batch size: 1, sequence length: 21907 Attention mask shape: torch.Size([1, 1, 21907, 21907]) Position ids shape: torch.Size([1, 21907]) Input IDs shape: torch.Size([1, 21907]) Labels shape: torch.Size([1, 21907]) Final batch size: 1, sequence length: 3010 Attention mask shape: torch.Size([1, 1, 3010, 3010]) Position ids shape: torch.Size([1, 3010]) Input IDs shape: torch.Size([1, 3010]) Labels shape: torch.Size([1, 3010]) Final batch size: 1, sequence length: 22778 Attention mask shape: torch.Size([1, 1, 22778, 22778]) Position ids shape: torch.Size([1, 22778]) Input IDs shape: torch.Size([1, 22778]) Labels shape: torch.Size([1, 22778]) Final batch size: 1, sequence length: 24107 Attention mask shape: torch.Size([1, 1, 24107, 24107]) Position ids shape: torch.Size([1, 24107]) Input IDs shape: torch.Size([1, 24107]) Labels shape: torch.Size([1, 24107]) Final batch size: 1, sequence length: 19724 Attention mask shape: torch.Size([1, 1, 19724, 19724]) Position ids shape: torch.Size([1, 19724]) Input IDs shape: torch.Size([1, 19724]) Labels shape: torch.Size([1, 19724]) Final batch size: 1, sequence length: 25934 Attention mask shape: torch.Size([1, 1, 25934, 25934]) Position ids shape: torch.Size([1, 25934]) Input IDs shape: torch.Size([1, 25934]) Labels shape: torch.Size([1, 25934]) Final batch size: 1, sequence length: 28474 Attention mask shape: torch.Size([1, 1, 28474, 28474]) Position ids shape: torch.Size([1, 28474]) Input IDs shape: torch.Size([1, 28474]) Labels shape: torch.Size([1, 28474]) Final batch size: 1, sequence length: 28012 Attention mask shape: torch.Size([1, 1, 28012, 28012]) Position ids shape: torch.Size([1, 28012]) Input IDs shape: torch.Size([1, 28012]) Labels shape: torch.Size([1, 28012]) Final batch size: 1, sequence length: 23442 Attention mask shape: torch.Size([1, 1, 23442, 23442]) Position ids shape: torch.Size([1, 23442]) Input IDs shape: torch.Size([1, 23442]) Labels shape: torch.Size([1, 23442]) Final batch size: 1, sequence length: 22851 Attention mask shape: torch.Size([1, 1, 22851, 22851]) Position ids shape: torch.Size([1, 22851]) Input IDs shape: torch.Size([1, 22851]) Labels shape: torch.Size([1, 22851]) Final batch size: 1, sequence length: 21853 Attention mask shape: torch.Size([1, 1, 21853, 21853]) Position ids shape: torch.Size([1, 21853]) Input IDs shape: torch.Size([1, 21853]) Labels shape: torch.Size([1, 21853]) Final batch size: 1, sequence length: 12234 Attention mask shape: torch.Size([1, 1, 12234, 12234]) Position ids shape: torch.Size([1, 12234]) Input IDs shape: torch.Size([1, 12234]) Labels shape: torch.Size([1, 12234]) Final batch size: 1, sequence length: 26454 Attention mask shape: torch.Size([1, 1, 26454, 26454]) Position ids shape: torch.Size([1, 26454]) Input IDs shape: torch.Size([1, 26454]) Labels shape: torch.Size([1, 26454]) Final batch size: 1, sequence length: 24364 Attention mask shape: torch.Size([1, 1, 24364, 24364]) Position ids shape: torch.Size([1, 24364]) Input IDs shape: torch.Size([1, 24364]) Labels shape: torch.Size([1, 24364]) Final batch size: 1, sequence length: 26424 Attention mask shape: torch.Size([1, 1, 26424, 26424]) Position ids shape: torch.Size([1, 26424]) Input IDs shape: torch.Size([1, 26424]) Labels shape: torch.Size([1, 26424]) Final batch size: 1, sequence length: 30235 Attention mask shape: torch.Size([1, 1, 30235, 30235]) Position ids shape: torch.Size([1, 30235]) Input IDs shape: torch.Size([1, 30235]) Labels shape: torch.Size([1, 30235]) Final batch size: 1, sequence length: 24716 Attention mask shape: torch.Size([1, 1, 24716, 24716]) Position ids shape: torch.Size([1, 24716]) Input IDs shape: torch.Size([1, 24716]) Labels shape: torch.Size([1, 24716]) Final batch size: 1, sequence length: 20314 Attention mask shape: torch.Size([1, 1, 20314, 20314]) Position ids shape: torch.Size([1, 20314]) Input IDs shape: torch.Size([1, 20314]) Labels shape: torch.Size([1, 20314]) Final batch size: 1, sequence length: 14137 Attention mask shape: torch.Size([1, 1, 14137, 14137]) Position ids shape: torch.Size([1, 14137]) Input IDs shape: torch.Size([1, 14137]) Labels shape: torch.Size([1, 14137]) Final batch size: 1, sequence length: 27384 Attention mask shape: torch.Size([1, 1, 27384, 27384]) Position ids shape: torch.Size([1, 27384]) Input IDs shape: torch.Size([1, 27384]) Labels shape: torch.Size([1, 27384]) Final batch size: 1, sequence length: 19027 Attention mask shape: torch.Size([1, 1, 19027, 19027]) Position ids shape: torch.Size([1, 19027]) Input IDs shape: torch.Size([1, 19027]) Labels shape: torch.Size([1, 19027]) Final batch size: 1, sequence length: 26500 Attention mask shape: torch.Size([1, 1, 26500, 26500]) Position ids shape: torch.Size([1, 26500]) Input IDs shape: torch.Size([1, 26500]) Labels shape: torch.Size([1, 26500]) Final batch size: 1, sequence length: 27520 Attention mask shape: torch.Size([1, 1, 27520, 27520]) Position ids shape: torch.Size([1, 27520]) Input IDs shape: torch.Size([1, 27520]) Labels shape: torch.Size([1, 27520]) Final batch size: 1, sequence length: 11662 Attention mask shape: torch.Size([1, 1, 11662, 11662]) Position ids shape: torch.Size([1, 11662]) Input IDs shape: torch.Size([1, 11662]) Labels shape: torch.Size([1, 11662]) Final batch size: 1, sequence length: 20028 Attention mask shape: torch.Size([1, 1, 20028, 20028]) Position ids shape: torch.Size([1, 20028]) Input IDs shape: torch.Size([1, 20028]) Labels shape: torch.Size([1, 20028]) Final batch size: 1, sequence length: 21168 Attention mask shape: torch.Size([1, 1, 21168, 21168]) Position ids shape: torch.Size([1, 21168]) Input IDs shape: torch.Size([1, 21168]) Labels shape: torch.Size([1, 21168]) Final batch size: 1, sequence length: 24111 Attention mask shape: torch.Size([1, 1, 24111, 24111]) Position ids shape: torch.Size([1, 24111]) Input IDs shape: torch.Size([1, 24111]) Labels shape: torch.Size([1, 24111]) Final batch size: 1, sequence length: 23855 Attention mask shape: torch.Size([1, 1, 23855, 23855]) Position ids shape: torch.Size([1, 23855]) Input IDs shape: torch.Size([1, 23855]) Labels shape: torch.Size([1, 23855]) Final batch size: 1, sequence length: 24923 Attention mask shape: torch.Size([1, 1, 24923, 24923]) Position ids shape: torch.Size([1, 24923]) Input IDs shape: torch.Size([1, 24923]) Labels shape: torch.Size([1, 24923]) Final batch size: 1, sequence length: 24830 Attention mask shape: torch.Size([1, 1, 24830, 24830]) Position ids shape: torch.Size([1, 24830]) Input IDs shape: torch.Size([1, 24830]) Labels shape: torch.Size([1, 24830]) Final batch size: 1, sequence length: 22920 Attention mask shape: torch.Size([1, 1, 22920, 22920]) Position ids shape: torch.Size([1, 22920]) Input IDs shape: torch.Size([1, 22920]) Labels shape: torch.Size([1, 22920]) Final batch size: 1, sequence length: 19302 Attention mask shape: torch.Size([1, 1, 19302, 19302]) Position ids shape: torch.Size([1, 19302]) Input IDs shape: torch.Size([1, 19302]) Labels shape: torch.Size([1, 19302]) Final batch size: 1, sequence length: 33765 Attention mask shape: torch.Size([1, 1, 33765, 33765]) Position ids shape: torch.Size([1, 33765]) Input IDs shape: torch.Size([1, 33765]) Labels shape: torch.Size([1, 33765]) Final batch size: 1, sequence length: 30041 Attention mask shape: torch.Size([1, 1, 30041, 30041]) Position ids shape: torch.Size([1, 30041]) Input IDs shape: torch.Size([1, 30041]) Labels shape: torch.Size([1, 30041]) Final batch size: 1, sequence length: 35555 Attention mask shape: torch.Size([1, 1, 35555, 35555]) Position ids shape: torch.Size([1, 35555]) Input IDs shape: torch.Size([1, 35555]) Labels shape: torch.Size([1, 35555]) Final batch size: 1, sequence length: 33179 Attention mask shape: torch.Size([1, 1, 33179, 33179]) Position ids shape: torch.Size([1, 33179]) Input IDs shape: torch.Size([1, 33179]) Labels shape: torch.Size([1, 33179]) Final batch size: 1, sequence length: 24187 Attention mask shape: torch.Size([1, 1, 24187, 24187]) Position ids shape: torch.Size([1, 24187]) Input IDs shape: torch.Size([1, 24187]) Labels shape: torch.Size([1, 24187]) Final batch size: 1, sequence length: 29842 Attention mask shape: torch.Size([1, 1, 29842, 29842]) Position ids shape: torch.Size([1, 29842]) Input IDs shape: torch.Size([1, 29842]) Labels shape: torch.Size([1, 29842]) Final batch size: 1, sequence length: 12519 Attention mask shape: torch.Size([1, 1, 12519, 12519]) Position ids shape: torch.Size([1, 12519]) Input IDs shape: torch.Size([1, 12519]) Labels shape: torch.Size([1, 12519]) Final batch size: 1, sequence length: 31404 Attention mask shape: torch.Size([1, 1, 31404, 31404]) Position ids shape: torch.Size([1, 31404]) Input IDs shape: torch.Size([1, 31404]) Labels shape: torch.Size([1, 31404]) Final batch size: 1, sequence length: 33321 Attention mask shape: torch.Size([1, 1, 33321, 33321]) Position ids shape: torch.Size([1, 33321]) Input IDs shape: torch.Size([1, 33321]) Labels shape: torch.Size([1, 33321]) Final batch size: 1, sequence length: 23695 Attention mask shape: torch.Size([1, 1, 23695, 23695]) Position ids shape: torch.Size([1, 23695]) Input IDs shape: torch.Size([1, 23695]) Labels shape: torch.Size([1, 23695]) Final batch size: 1, sequence length: 17763 Attention mask shape: torch.Size([1, 1, 17763, 17763]) Position ids shape: torch.Size([1, 17763]) Input IDs shape: torch.Size([1, 17763]) Labels shape: torch.Size([1, 17763]) Final batch size: 1, sequence length: 29743 Attention mask shape: torch.Size([1, 1, 29743, 29743]) Position ids shape: torch.Size([1, 29743]) Input IDs shape: torch.Size([1, 29743]) Labels shape: torch.Size([1, 29743]) Final batch size: 1, sequence length: 34169 Attention mask shape: torch.Size([1, 1, 34169, 34169]) Position ids shape: torch.Size([1, 34169]) Input IDs shape: torch.Size([1, 34169]) Labels shape: torch.Size([1, 34169]) Final batch size: 1, sequence length: 29385 Attention mask shape: torch.Size([1, 1, 29385, 29385]) Position ids shape: torch.Size([1, 29385]) Input IDs shape: torch.Size([1, 29385]) Labels shape: torch.Size([1, 29385]) Final batch size: 1, sequence length: 27107 Attention mask shape: torch.Size([1, 1, 27107, 27107]) Position ids shape: torch.Size([1, 27107]) Input IDs shape: torch.Size([1, 27107]) Labels shape: torch.Size([1, 27107]) Final batch size: 1, sequence length: 27738 Attention mask shape: torch.Size([1, 1, 27738, 27738]) Position ids shape: torch.Size([1, 27738]) Input IDs shape: torch.Size([1, 27738]) Labels shape: torch.Size([1, 27738]) Final batch size: 1, sequence length: 40006 Attention mask shape: torch.Size([1, 1, 40006, 40006]) Position ids shape: torch.Size([1, 40006]) Input IDs shape: torch.Size([1, 40006]) Labels shape: torch.Size([1, 40006]) Final batch size: 1, sequence length: 38576 Attention mask shape: torch.Size([1, 1, 38576, 38576]) Position ids shape: torch.Size([1, 38576]) Input IDs shape: torch.Size([1, 38576]) Labels shape: torch.Size([1, 38576]) Final batch size: 1, sequence length: 26696 Attention mask shape: torch.Size([1, 1, 26696, 26696]) Position ids shape: torch.Size([1, 26696]) Input IDs shape: torch.Size([1, 26696]) Labels shape: torch.Size([1, 26696]) Final batch size: 1, sequence length: 10139 Attention mask shape: torch.Size([1, 1, 10139, 10139]) Position ids shape: torch.Size([1, 10139]) Input IDs shape: torch.Size([1, 10139]) Labels shape: torch.Size([1, 10139]) Final batch size: 1, sequence length: 31115 Attention mask shape: torch.Size([1, 1, 31115, 31115]) Position ids shape: torch.Size([1, 31115]) Input IDs shape: torch.Size([1, 31115]) Labels shape: torch.Size([1, 31115]) Final batch size: 1, sequence length: 32217 Attention mask shape: torch.Size([1, 1, 32217, 32217]) Position ids shape: torch.Size([1, 32217]) Input IDs shape: torch.Size([1, 32217]) Labels shape: torch.Size([1, 32217]) Final batch size: 1, sequence length: 37657 Attention mask shape: torch.Size([1, 1, 37657, 37657]) Position ids shape: torch.Size([1, 37657]) Input IDs shape: torch.Size([1, 37657]) Labels shape: torch.Size([1, 37657]) Final batch size: 1, sequence length: 26685 Attention mask shape: torch.Size([1, 1, 26685, 26685]) Position ids shape: torch.Size([1, 26685]) Input IDs shape: torch.Size([1, 26685]) Labels shape: torch.Size([1, 26685]) Final batch size: 1, sequence length: 18734 Attention mask shape: torch.Size([1, 1, 18734, 18734]) Position ids shape: torch.Size([1, 18734]) Input IDs shape: torch.Size([1, 18734]) Labels shape: torch.Size([1, 18734]) Final batch size: 1, sequence length: 29085 Attention mask shape: torch.Size([1, 1, 29085, 29085]) Position ids shape: torch.Size([1, 29085]) Input IDs shape: torch.Size([1, 29085]) Labels shape: torch.Size([1, 29085]) Final batch size: 1, sequence length: 39018 Attention mask shape: torch.Size([1, 1, 39018, 39018]) Position ids shape: torch.Size([1, 39018]) Input IDs shape: torch.Size([1, 39018]) Labels shape: torch.Size([1, 39018]) Final batch size: 1, sequence length: 29286 Attention mask shape: torch.Size([1, 1, 29286, 29286]) Position ids shape: torch.Size([1, 29286]) Input IDs shape: torch.Size([1, 29286]) Labels shape: torch.Size([1, 29286]) Final batch size: 1, sequence length: 19138 Attention mask shape: torch.Size([1, 1, 19138, 19138]) Position ids shape: torch.Size([1, 19138]) Input IDs shape: torch.Size([1, 19138]) Labels shape: torch.Size([1, 19138]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 31991 Attention mask shape: torch.Size([1, 1, 31991, 31991]) Position ids shape: torch.Size([1, 31991]) Input IDs shape: torch.Size([1, 31991]) Labels shape: torch.Size([1, 31991]) Final batch size: 1, sequence length: 35949 Attention mask shape: torch.Size([1, 1, 35949, 35949]) Position ids shape: torch.Size([1, 35949]) Input IDs shape: torch.Size([1, 35949]) Labels shape: torch.Size([1, 35949]) Final batch size: 1, sequence length: 23848 Attention mask shape: torch.Size([1, 1, 23848, 23848]) Position ids shape: torch.Size([1, 23848]) Input IDs shape: torch.Size([1, 23848]) Labels shape: torch.Size([1, 23848]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 25990 Attention mask shape: torch.Size([1, 1, 25990, 25990]) Position ids shape: torch.Size([1, 25990]) Input IDs shape: torch.Size([1, 25990]) Labels shape: torch.Size([1, 25990]) Final batch size: 1, sequence length: 32762 Attention mask shape: torch.Size([1, 1, 32762, 32762]) Position ids shape: torch.Size([1, 32762]) Input IDs shape: torch.Size([1, 32762]) Labels shape: torch.Size([1, 32762]) Final batch size: 1, sequence length: 22500 Attention mask shape: torch.Size([1, 1, 22500, 22500]) Position ids shape: torch.Size([1, 22500]) Input IDs shape: torch.Size([1, 22500]) Labels shape: torch.Size([1, 22500]) Final batch size: 1, sequence length: 37680 Attention mask shape: torch.Size([1, 1, 37680, 37680]) Position ids shape: torch.Size([1, 37680]) Input IDs shape: torch.Size([1, 37680]) Labels shape: torch.Size([1, 37680]) Final batch size: 1, sequence length: 40834 Attention mask shape: torch.Size([1, 1, 40834, 40834]) Position ids shape: torch.Size([1, 40834]) Input IDs shape: torch.Size([1, 40834]) Labels shape: torch.Size([1, 40834]) Final batch size: 1, sequence length: 34628 Attention mask shape: torch.Size([1, 1, 34628, 34628]) Position ids shape: torch.Size([1, 34628]) Input IDs shape: torch.Size([1, 34628]) Labels shape: torch.Size([1, 34628]) Final batch size: 1, sequence length: 26372 Attention mask shape: torch.Size([1, 1, 26372, 26372]) Position ids shape: torch.Size([1, 26372]) Input IDs shape: torch.Size([1, 26372]) Labels shape: torch.Size([1, 26372]) Final batch size: 1, sequence length: 22778 Attention mask shape: torch.Size([1, 1, 22778, 22778]) Position ids shape: torch.Size([1, 22778]) Input IDs shape: torch.Size([1, 22778]) Labels shape: torch.Size([1, 22778]) Final batch size: 1, sequence length: 32675 Attention mask shape: torch.Size([1, 1, 32675, 32675]) Position ids shape: torch.Size([1, 32675]) Input IDs shape: torch.Size([1, 32675]) Labels shape: torch.Size([1, 32675]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 34791 Attention mask shape: torch.Size([1, 1, 34791, 34791]) Position ids shape: torch.Size([1, 34791]) Input IDs shape: torch.Size([1, 34791]) Labels shape: torch.Size([1, 34791]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) {'loss': 0.2495, 'grad_norm': 0.23619268871607607, 'learning_rate': 2.061073738537635e-06, 'num_tokens': -inf, 'epoch': 5.88} Final batch size: 1, sequence length: 5686 Attention mask shape: torch.Size([1, 1, 5686, 5686]) Position ids shape: torch.Size([1, 5686]) Input IDs shape: torch.Size([1, 5686]) Labels shape: torch.Size([1, 5686]) Final batch size: 1, sequence length: 6986 Attention mask shape: torch.Size([1, 1, 6986, 6986]) Position ids shape: torch.Size([1, 6986]) Input IDs shape: torch.Size([1, 6986]) Labels shape: torch.Size([1, 6986]) Final batch size: 1, sequence length: 12847 Attention mask shape: torch.Size([1, 1, 12847, 12847]) Position ids shape: torch.Size([1, 12847]) Input IDs shape: torch.Size([1, 12847]) Labels shape: torch.Size([1, 12847]) Final batch size: 1, sequence length: 11304 Attention mask shape: torch.Size([1, 1, 11304, 11304]) Position ids shape: torch.Size([1, 11304]) Input IDs shape: torch.Size([1, 11304]) Labels shape: torch.Size([1, 11304]) Final batch size: 1, sequence length: 14304 Attention mask shape: torch.Size([1, 1, 14304, 14304]) Position ids shape: torch.Size([1, 14304]) Input IDs shape: torch.Size([1, 14304]) Labels shape: torch.Size([1, 14304]) Final batch size: 1, sequence length: 15621 Attention mask shape: torch.Size([1, 1, 15621, 15621]) Position ids shape: torch.Size([1, 15621]) Input IDs shape: torch.Size([1, 15621]) Labels shape: torch.Size([1, 15621]) Final batch size: 1, sequence length: 13872 Attention mask shape: torch.Size([1, 1, 13872, 13872]) Position ids shape: torch.Size([1, 13872]) Input IDs shape: torch.Size([1, 13872]) Labels shape: torch.Size([1, 13872]) Final batch size: 1, sequence length: 13224 Attention mask shape: torch.Size([1, 1, 13224, 13224]) Position ids shape: torch.Size([1, 13224]) Input IDs shape: torch.Size([1, 13224]) Labels shape: torch.Size([1, 13224]) Final batch size: 1, sequence length: 14168 Attention mask shape: torch.Size([1, 1, 14168, 14168]) Position ids shape: torch.Size([1, 14168]) Input IDs shape: torch.Size([1, 14168]) Labels shape: torch.Size([1, 14168]) Final batch size: 1, sequence length: 16514 Attention mask shape: torch.Size([1, 1, 16514, 16514]) Position ids shape: torch.Size([1, 16514]) Input IDs shape: torch.Size([1, 16514]) Labels shape: torch.Size([1, 16514]) Final batch size: 1, sequence length: 16440 Attention mask shape: torch.Size([1, 1, 16440, 16440]) Position ids shape: torch.Size([1, 16440]) Input IDs shape: torch.Size([1, 16440]) Labels shape: torch.Size([1, 16440]) Final batch size: 1, sequence length: 15652 Attention mask shape: torch.Size([1, 1, 15652, 15652]) Position ids shape: torch.Size([1, 15652]) Input IDs shape: torch.Size([1, 15652]) Labels shape: torch.Size([1, 15652]) Final batch size: 1, sequence length: 18246 Attention mask shape: torch.Size([1, 1, 18246, 18246]) Position ids shape: torch.Size([1, 18246]) Input IDs shape: torch.Size([1, 18246]) Labels shape: torch.Size([1, 18246]) Final batch size: 1, sequence length: 17852 Attention mask shape: torch.Size([1, 1, 17852, 17852]) Position ids shape: torch.Size([1, 17852]) Input IDs shape: torch.Size([1, 17852]) Labels shape: torch.Size([1, 17852]) Final batch size: 1, sequence length: 17213 Attention mask shape: torch.Size([1, 1, 17213, 17213]) Position ids shape: torch.Size([1, 17213]) Input IDs shape: torch.Size([1, 17213]) Labels shape: torch.Size([1, 17213]) Final batch size: 1, sequence length: 17773 Attention mask shape: torch.Size([1, 1, 17773, 17773]) Position ids shape: torch.Size([1, 17773]) Input IDs shape: torch.Size([1, 17773]) Labels shape: torch.Size([1, 17773]) Final batch size: 1, sequence length: 16501 Attention mask shape: torch.Size([1, 1, 16501, 16501]) Position ids shape: torch.Size([1, 16501]) Input IDs shape: torch.Size([1, 16501]) Labels shape: torch.Size([1, 16501]) Final batch size: 1, sequence length: 16378 Attention mask shape: torch.Size([1, 1, 16378, 16378]) Position ids shape: torch.Size([1, 16378]) Input IDs shape: torch.Size([1, 16378]) Labels shape: torch.Size([1, 16378]) Final batch size: 1, sequence length: 21163 Attention mask shape: torch.Size([1, 1, 21163, 21163]) Position ids shape: torch.Size([1, 21163]) Input IDs shape: torch.Size([1, 21163]) Labels shape: torch.Size([1, 21163]) Final batch size: 1, sequence length: 18044 Attention mask shape: torch.Size([1, 1, 18044, 18044]) Position ids shape: torch.Size([1, 18044]) Input IDs shape: torch.Size([1, 18044]) Labels shape: torch.Size([1, 18044]) Final batch size: 1, sequence length: 18289 Attention mask shape: torch.Size([1, 1, 18289, 18289]) Position ids shape: torch.Size([1, 18289]) Input IDs shape: torch.Size([1, 18289]) Labels shape: torch.Size([1, 18289]) Final batch size: 1, sequence length: 21207 Attention mask shape: torch.Size([1, 1, 21207, 21207]) Position ids shape: torch.Size([1, 21207]) Input IDs shape: torch.Size([1, 21207]) Labels shape: torch.Size([1, 21207]) Final batch size: 1, sequence length: 20176 Attention mask shape: torch.Size([1, 1, 20176, 20176]) Position ids shape: torch.Size([1, 20176]) Input IDs shape: torch.Size([1, 20176]) Labels shape: torch.Size([1, 20176]) Final batch size: 1, sequence length: 18155 Attention mask shape: torch.Size([1, 1, 18155, 18155]) Position ids shape: torch.Size([1, 18155]) Input IDs shape: torch.Size([1, 18155]) Labels shape: torch.Size([1, 18155]) Final batch size: 1, sequence length: 20813 Attention mask shape: torch.Size([1, 1, 20813, 20813]) Position ids shape: torch.Size([1, 20813]) Input IDs shape: torch.Size([1, 20813]) Labels shape: torch.Size([1, 20813]) Final batch size: 1, sequence length: 22784 Attention mask shape: torch.Size([1, 1, 22784, 22784]) Position ids shape: torch.Size([1, 22784]) Input IDs shape: torch.Size([1, 22784]) Labels shape: torch.Size([1, 22784]) Final batch size: 1, sequence length: 21562 Attention mask shape: torch.Size([1, 1, 21562, 21562]) Position ids shape: torch.Size([1, 21562]) Input IDs shape: torch.Size([1, 21562]) Labels shape: torch.Size([1, 21562]) Final batch size: 1, sequence length: 20766 Attention mask shape: torch.Size([1, 1, 20766, 20766]) Position ids shape: torch.Size([1, 20766]) Input IDs shape: torch.Size([1, 20766]) Labels shape: torch.Size([1, 20766]) Final batch size: 1, sequence length: 22152 Attention mask shape: torch.Size([1, 1, 22152, 22152]) Position ids shape: torch.Size([1, 22152]) Input IDs shape: torch.Size([1, 22152]) Labels shape: torch.Size([1, 22152]) Final batch size: 1, sequence length: 21607 Attention mask shape: torch.Size([1, 1, 21607, 21607]) Position ids shape: torch.Size([1, 21607]) Input IDs shape: torch.Size([1, 21607]) Labels shape: torch.Size([1, 21607]) Final batch size: 1, sequence length: 22117 Attention mask shape: torch.Size([1, 1, 22117, 22117]) Position ids shape: torch.Size([1, 22117]) Input IDs shape: torch.Size([1, 22117]) Labels shape: torch.Size([1, 22117]) Final batch size: 1, sequence length: 21705 Attention mask shape: torch.Size([1, 1, 21705, 21705]) Position ids shape: torch.Size([1, 21705]) Input IDs shape: torch.Size([1, 21705]) Labels shape: torch.Size([1, 21705]) Final batch size: 1, sequence length: 21316 Attention mask shape: torch.Size([1, 1, 21316, 21316]) Position ids shape: torch.Size([1, 21316]) Input IDs shape: torch.Size([1, 21316]) Labels shape: torch.Size([1, 21316]) Final batch size: 1, sequence length: 22197 Attention mask shape: torch.Size([1, 1, 22197, 22197]) Position ids shape: torch.Size([1, 22197]) Input IDs shape: torch.Size([1, 22197]) Labels shape: torch.Size([1, 22197]) Final batch size: 1, sequence length: 25880 Attention mask shape: torch.Size([1, 1, 25880, 25880]) Position ids shape: torch.Size([1, 25880]) Input IDs shape: torch.Size([1, 25880]) Labels shape: torch.Size([1, 25880]) Final batch size: 1, sequence length: 25053 Attention mask shape: torch.Size([1, 1, 25053, 25053]) Position ids shape: torch.Size([1, 25053]) Input IDs shape: torch.Size([1, 25053]) Labels shape: torch.Size([1, 25053]) Final batch size: 1, sequence length: 26921 Attention mask shape: torch.Size([1, 1, 26921, 26921]) Position ids shape: torch.Size([1, 26921]) Input IDs shape: torch.Size([1, 26921]) Labels shape: torch.Size([1, 26921]) Final batch size: 1, sequence length: 26303 Attention mask shape: torch.Size([1, 1, 26303, 26303]) Position ids shape: torch.Size([1, 26303]) Input IDs shape: torch.Size([1, 26303]) Labels shape: torch.Size([1, 26303]) Final batch size: 1, sequence length: 26494 Attention mask shape: torch.Size([1, 1, 26494, 26494]) Position ids shape: torch.Size([1, 26494]) Input IDs shape: torch.Size([1, 26494]) Labels shape: torch.Size([1, 26494]) Final batch size: 1, sequence length: 27005 Attention mask shape: torch.Size([1, 1, 27005, 27005]) Position ids shape: torch.Size([1, 27005]) Input IDs shape: torch.Size([1, 27005]) Labels shape: torch.Size([1, 27005]) Final batch size: 1, sequence length: 28523 Attention mask shape: torch.Size([1, 1, 28523, 28523]) Position ids shape: torch.Size([1, 28523]) Input IDs shape: torch.Size([1, 28523]) Labels shape: torch.Size([1, 28523]) Final batch size: 1, sequence length: 30592 Attention mask shape: torch.Size([1, 1, 30592, 30592]) Position ids shape: torch.Size([1, 30592]) Input IDs shape: torch.Size([1, 30592]) Labels shape: torch.Size([1, 30592]) Final batch size: 1, sequence length: 30151 Attention mask shape: torch.Size([1, 1, 30151, 30151]) Position ids shape: torch.Size([1, 30151]) Input IDs shape: torch.Size([1, 30151]) Labels shape: torch.Size([1, 30151]) Final batch size: 1, sequence length: 28897 Attention mask shape: torch.Size([1, 1, 28897, 28897]) Position ids shape: torch.Size([1, 28897]) Input IDs shape: torch.Size([1, 28897]) Labels shape: torch.Size([1, 28897]) Final batch size: 1, sequence length: 33482 Attention mask shape: torch.Size([1, 1, 33482, 33482]) Position ids shape: torch.Size([1, 33482]) Input IDs shape: torch.Size([1, 33482]) Labels shape: torch.Size([1, 33482]) Final batch size: 1, sequence length: 32473 Attention mask shape: torch.Size([1, 1, 32473, 32473]) Position ids shape: torch.Size([1, 32473]) Input IDs shape: torch.Size([1, 32473]) Labels shape: torch.Size([1, 32473]) Final batch size: 1, sequence length: 35100 Attention mask shape: torch.Size([1, 1, 35100, 35100]) Position ids shape: torch.Size([1, 35100]) Input IDs shape: torch.Size([1, 35100]) Labels shape: torch.Size([1, 35100]) Final batch size: 1, sequence length: 32596 Attention mask shape: torch.Size([1, 1, 32596, 32596]) Position ids shape: torch.Size([1, 32596]) Input IDs shape: torch.Size([1, 32596]) Labels shape: torch.Size([1, 32596]) Final batch size: 1, sequence length: 35030 Attention mask shape: torch.Size([1, 1, 35030, 35030]) Position ids shape: torch.Size([1, 35030]) Input IDs shape: torch.Size([1, 35030]) Labels shape: torch.Size([1, 35030]) Final batch size: 1, sequence length: 35866 Attention mask shape: torch.Size([1, 1, 35866, 35866]) Position ids shape: torch.Size([1, 35866]) Input IDs shape: torch.Size([1, 35866]) Labels shape: torch.Size([1, 35866]) Final batch size: 1, sequence length: 34469 Attention mask shape: torch.Size([1, 1, 34469, 34469]) Position ids shape: torch.Size([1, 34469]) Input IDs shape: torch.Size([1, 34469]) Labels shape: torch.Size([1, 34469]) Final batch size: 1, sequence length: 35485 Attention mask shape: torch.Size([1, 1, 35485, 35485]) Position ids shape: torch.Size([1, 35485]) Input IDs shape: torch.Size([1, 35485]) Labels shape: torch.Size([1, 35485]) Final batch size: 1, sequence length: 35523 Attention mask shape: torch.Size([1, 1, 35523, 35523]) Position ids shape: torch.Size([1, 35523]) Input IDs shape: torch.Size([1, 35523]) Labels shape: torch.Size([1, 35523]) Final batch size: 1, sequence length: 37980 Attention mask shape: torch.Size([1, 1, 37980, 37980]) Position ids shape: torch.Size([1, 37980]) Input IDs shape: torch.Size([1, 37980]) Labels shape: torch.Size([1, 37980]) Final batch size: 1, sequence length: 40526 Attention mask shape: torch.Size([1, 1, 40526, 40526]) Position ids shape: torch.Size([1, 40526]) Input IDs shape: torch.Size([1, 40526]) Labels shape: torch.Size([1, 40526]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 37040 Attention mask shape: torch.Size([1, 1, 37040, 37040]) Position ids shape: torch.Size([1, 37040]) Input IDs shape: torch.Size([1, 37040]) Labels shape: torch.Size([1, 37040]) Final batch size: 1, sequence length: 39589 Attention mask shape: torch.Size([1, 1, 39589, 39589]) Position ids shape: torch.Size([1, 39589]) Input IDs shape: torch.Size([1, 39589]) Labels shape: torch.Size([1, 39589]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 38927 Attention mask shape: torch.Size([1, 1, 38927, 38927]) Position ids shape: torch.Size([1, 38927]) Input IDs shape: torch.Size([1, 38927]) Labels shape: torch.Size([1, 38927]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) {'loss': 0.2422, 'grad_norm': 0.2596387270781892, 'learning_rate': 1.8533980447508138e-06, 'num_tokens': -inf, 'epoch': 6.0} Final batch size: 1, sequence length: 5686 Attention mask shape: torch.Size([1, 1, 5686, 5686]) Position ids shape: torch.Size([1, 5686]) Input IDs shape: torch.Size([1, 5686]) Labels shape: torch.Size([1, 5686]) Final batch size: 1, sequence length: 6986 Attention mask shape: torch.Size([1, 1, 6986, 6986]) Position ids shape: torch.Size([1, 6986]) Input IDs shape: torch.Size([1, 6986]) Labels shape: torch.Size([1, 6986]) Final batch size: 1, sequence length: 6362 Attention mask shape: torch.Size([1, 1, 6362, 6362]) Position ids shape: torch.Size([1, 6362]) Input IDs shape: torch.Size([1, 6362]) Labels shape: torch.Size([1, 6362]) Final batch size: 1, sequence length: 11304 Attention mask shape: torch.Size([1, 1, 11304, 11304]) Position ids shape: torch.Size([1, 11304]) Input IDs shape: torch.Size([1, 11304]) Labels shape: torch.Size([1, 11304]) Final batch size: 1, sequence length: 12847 Attention mask shape: torch.Size([1, 1, 12847, 12847]) Position ids shape: torch.Size([1, 12847]) Input IDs shape: torch.Size([1, 12847]) Labels shape: torch.Size([1, 12847]) Final batch size: 1, sequence length: 14304 Attention mask shape: torch.Size([1, 1, 14304, 14304]) Position ids shape: torch.Size([1, 14304]) Input IDs shape: torch.Size([1, 14304]) Labels shape: torch.Size([1, 14304]) Final batch size: 1, sequence length: 13224 Attention mask shape: torch.Size([1, 1, 13224, 13224]) Position ids shape: torch.Size([1, 13224]) Input IDs shape: torch.Size([1, 13224]) Labels shape: torch.Size([1, 13224]) Final batch size: 1, sequence length: 13872 Attention mask shape: torch.Size([1, 1, 13872, 13872]) Position ids shape: torch.Size([1, 13872]) Input IDs shape: torch.Size([1, 13872]) Labels shape: torch.Size([1, 13872]) Final batch size: 1, sequence length: 10523 Attention mask shape: torch.Size([1, 1, 10523, 10523]) Position ids shape: torch.Size([1, 10523]) Input IDs shape: torch.Size([1, 10523]) Labels shape: torch.Size([1, 10523]) Final batch size: 1, sequence length: 15652 Attention mask shape: torch.Size([1, 1, 15652, 15652]) Position ids shape: torch.Size([1, 15652]) Input IDs shape: torch.Size([1, 15652]) Labels shape: torch.Size([1, 15652]) Final batch size: 1, sequence length: 14168 Attention mask shape: torch.Size([1, 1, 14168, 14168]) Position ids shape: torch.Size([1, 14168]) Input IDs shape: torch.Size([1, 14168]) Labels shape: torch.Size([1, 14168]) Final batch size: 1, sequence length: 16378 Attention mask shape: torch.Size([1, 1, 16378, 16378]) Position ids shape: torch.Size([1, 16378]) Input IDs shape: torch.Size([1, 16378]) Labels shape: torch.Size([1, 16378]) Final batch size: 1, sequence length: 16501 Attention mask shape: torch.Size([1, 1, 16501, 16501]) Position ids shape: torch.Size([1, 16501]) Input IDs shape: torch.Size([1, 16501]) Labels shape: torch.Size([1, 16501]) Final batch size: 1, sequence length: 17213 Attention mask shape: torch.Size([1, 1, 17213, 17213]) Position ids shape: torch.Size([1, 17213]) Input IDs shape: torch.Size([1, 17213]) Labels shape: torch.Size([1, 17213]) Final batch size: 1, sequence length: 16514 Attention mask shape: torch.Size([1, 1, 16514, 16514]) Position ids shape: torch.Size([1, 16514]) Input IDs shape: torch.Size([1, 16514]) Labels shape: torch.Size([1, 16514]) Final batch size: 1, sequence length: 17852 Attention mask shape: torch.Size([1, 1, 17852, 17852]) Position ids shape: torch.Size([1, 17852]) Input IDs shape: torch.Size([1, 17852]) Labels shape: torch.Size([1, 17852]) Final batch size: 1, sequence length: 18289 Attention mask shape: torch.Size([1, 1, 18289, 18289]) Position ids shape: torch.Size([1, 18289]) Input IDs shape: torch.Size([1, 18289]) Labels shape: torch.Size([1, 18289]) Final batch size: 1, sequence length: 18246 Attention mask shape: torch.Size([1, 1, 18246, 18246]) Position ids shape: torch.Size([1, 18246]) Input IDs shape: torch.Size([1, 18246]) Labels shape: torch.Size([1, 18246]) Final batch size: 1, sequence length: 16440 Attention mask shape: torch.Size([1, 1, 16440, 16440]) Position ids shape: torch.Size([1, 16440]) Input IDs shape: torch.Size([1, 16440]) Labels shape: torch.Size([1, 16440]) Final batch size: 1, sequence length: 15621 Attention mask shape: torch.Size([1, 1, 15621, 15621]) Position ids shape: torch.Size([1, 15621]) Input IDs shape: torch.Size([1, 15621]) Labels shape: torch.Size([1, 15621]) Final batch size: 1, sequence length: 20176 Attention mask shape: torch.Size([1, 1, 20176, 20176]) Position ids shape: torch.Size([1, 20176]) Input IDs shape: torch.Size([1, 20176]) Labels shape: torch.Size([1, 20176]) Final batch size: 1, sequence length: 18155 Attention mask shape: torch.Size([1, 1, 18155, 18155]) Position ids shape: torch.Size([1, 18155]) Input IDs shape: torch.Size([1, 18155]) Labels shape: torch.Size([1, 18155]) Final batch size: 1, sequence length: 5911 Attention mask shape: torch.Size([1, 1, 5911, 5911]) Position ids shape: torch.Size([1, 5911]) Input IDs shape: torch.Size([1, 5911]) Labels shape: torch.Size([1, 5911]) Final batch size: 1, sequence length: 18044 Attention mask shape: torch.Size([1, 1, 18044, 18044]) Position ids shape: torch.Size([1, 18044]) Input IDs shape: torch.Size([1, 18044]) Labels shape: torch.Size([1, 18044]) Final batch size: 1, sequence length: 21163 Attention mask shape: torch.Size([1, 1, 21163, 21163]) Position ids shape: torch.Size([1, 21163]) Input IDs shape: torch.Size([1, 21163]) Labels shape: torch.Size([1, 21163]) Final batch size: 1, sequence length: 20813 Attention mask shape: torch.Size([1, 1, 20813, 20813]) Position ids shape: torch.Size([1, 20813]) Input IDs shape: torch.Size([1, 20813]) Labels shape: torch.Size([1, 20813]) Final batch size: 1, sequence length: 21705 Attention mask shape: torch.Size([1, 1, 21705, 21705]) Position ids shape: torch.Size([1, 21705]) Input IDs shape: torch.Size([1, 21705]) Labels shape: torch.Size([1, 21705]) Final batch size: 1, sequence length: 21607 Attention mask shape: torch.Size([1, 1, 21607, 21607]) Position ids shape: torch.Size([1, 21607]) Input IDs shape: torch.Size([1, 21607]) Labels shape: torch.Size([1, 21607]) Final batch size: 1, sequence length: 21316 Attention mask shape: torch.Size([1, 1, 21316, 21316]) Position ids shape: torch.Size([1, 21316]) Input IDs shape: torch.Size([1, 21316]) Labels shape: torch.Size([1, 21316]) Final batch size: 1, sequence length: 21962 Attention mask shape: torch.Size([1, 1, 21962, 21962]) Position ids shape: torch.Size([1, 21962]) Input IDs shape: torch.Size([1, 21962]) Labels shape: torch.Size([1, 21962]) Final batch size: 1, sequence length: 18545 Attention mask shape: torch.Size([1, 1, 18545, 18545]) Position ids shape: torch.Size([1, 18545]) Input IDs shape: torch.Size([1, 18545]) Labels shape: torch.Size([1, 18545]) Final batch size: 1, sequence length: 22784 Attention mask shape: torch.Size([1, 1, 22784, 22784]) Position ids shape: torch.Size([1, 22784]) Input IDs shape: torch.Size([1, 22784]) Labels shape: torch.Size([1, 22784]) Final batch size: 1, sequence length: 22117 Attention mask shape: torch.Size([1, 1, 22117, 22117]) Position ids shape: torch.Size([1, 22117]) Input IDs shape: torch.Size([1, 22117]) Labels shape: torch.Size([1, 22117]) Final batch size: 1, sequence length: 21562 Attention mask shape: torch.Size([1, 1, 21562, 21562]) Position ids shape: torch.Size([1, 21562]) Input IDs shape: torch.Size([1, 21562]) Labels shape: torch.Size([1, 21562]) Final batch size: 1, sequence length: 20766 Attention mask shape: torch.Size([1, 1, 20766, 20766]) Position ids shape: torch.Size([1, 20766]) Input IDs shape: torch.Size([1, 20766]) Labels shape: torch.Size([1, 20766]) Final batch size: 1, sequence length: 17910 Attention mask shape: torch.Size([1, 1, 17910, 17910]) Position ids shape: torch.Size([1, 17910]) Input IDs shape: torch.Size([1, 17910]) Labels shape: torch.Size([1, 17910]) Final batch size: 1, sequence length: 17773 Attention mask shape: torch.Size([1, 1, 17773, 17773]) Position ids shape: torch.Size([1, 17773]) Input IDs shape: torch.Size([1, 17773]) Labels shape: torch.Size([1, 17773]) Final batch size: 1, sequence length: 22152 Attention mask shape: torch.Size([1, 1, 22152, 22152]) Position ids shape: torch.Size([1, 22152]) Input IDs shape: torch.Size([1, 22152]) Labels shape: torch.Size([1, 22152]) Final batch size: 1, sequence length: 21207 Attention mask shape: torch.Size([1, 1, 21207, 21207]) Position ids shape: torch.Size([1, 21207]) Input IDs shape: torch.Size([1, 21207]) Labels shape: torch.Size([1, 21207]) Final batch size: 1, sequence length: 22197 Attention mask shape: torch.Size([1, 1, 22197, 22197]) Position ids shape: torch.Size([1, 22197]) Input IDs shape: torch.Size([1, 22197]) Labels shape: torch.Size([1, 22197]) Final batch size: 1, sequence length: 25053 Attention mask shape: torch.Size([1, 1, 25053, 25053]) Position ids shape: torch.Size([1, 25053]) Input IDs shape: torch.Size([1, 25053]) Labels shape: torch.Size([1, 25053]) Final batch size: 1, sequence length: 19962 Attention mask shape: torch.Size([1, 1, 19962, 19962]) Position ids shape: torch.Size([1, 19962]) Input IDs shape: torch.Size([1, 19962]) Labels shape: torch.Size([1, 19962]) Final batch size: 1, sequence length: 12421 Attention mask shape: torch.Size([1, 1, 12421, 12421]) Position ids shape: torch.Size([1, 12421]) Input IDs shape: torch.Size([1, 12421]) Labels shape: torch.Size([1, 12421]) Final batch size: 1, sequence length: 24365 Attention mask shape: torch.Size([1, 1, 24365, 24365]) Position ids shape: torch.Size([1, 24365]) Input IDs shape: torch.Size([1, 24365]) Labels shape: torch.Size([1, 24365]) Final batch size: 1, sequence length: 26303 Attention mask shape: torch.Size([1, 1, 26303, 26303]) Position ids shape: torch.Size([1, 26303]) Input IDs shape: torch.Size([1, 26303]) Labels shape: torch.Size([1, 26303]) Final batch size: 1, sequence length: 20101 Attention mask shape: torch.Size([1, 1, 20101, 20101]) Position ids shape: torch.Size([1, 20101]) Input IDs shape: torch.Size([1, 20101]) Labels shape: torch.Size([1, 20101]) Final batch size: 1, sequence length: 26494 Attention mask shape: torch.Size([1, 1, 26494, 26494]) Position ids shape: torch.Size([1, 26494]) Input IDs shape: torch.Size([1, 26494]) Labels shape: torch.Size([1, 26494]) Final batch size: 1, sequence length: 26921 Attention mask shape: torch.Size([1, 1, 26921, 26921]) Position ids shape: torch.Size([1, 26921]) Input IDs shape: torch.Size([1, 26921]) Labels shape: torch.Size([1, 26921]) Final batch size: 1, sequence length: 25880 Attention mask shape: torch.Size([1, 1, 25880, 25880]) Position ids shape: torch.Size([1, 25880]) Input IDs shape: torch.Size([1, 25880]) Labels shape: torch.Size([1, 25880]) Final batch size: 1, sequence length: 14496 Attention mask shape: torch.Size([1, 1, 14496, 14496]) Position ids shape: torch.Size([1, 14496]) Input IDs shape: torch.Size([1, 14496]) Labels shape: torch.Size([1, 14496]) Final batch size: 1, sequence length: 28523 Attention mask shape: torch.Size([1, 1, 28523, 28523]) Position ids shape: torch.Size([1, 28523]) Input IDs shape: torch.Size([1, 28523]) Labels shape: torch.Size([1, 28523]) Final batch size: 1, sequence length: 27633 Attention mask shape: torch.Size([1, 1, 27633, 27633]) Position ids shape: torch.Size([1, 27633]) Input IDs shape: torch.Size([1, 27633]) Labels shape: torch.Size([1, 27633]) Final batch size: 1, sequence length: 13031 Attention mask shape: torch.Size([1, 1, 13031, 13031]) Position ids shape: torch.Size([1, 13031]) Input IDs shape: torch.Size([1, 13031]) Labels shape: torch.Size([1, 13031]) Final batch size: 1, sequence length: 12224 Attention mask shape: torch.Size([1, 1, 12224, 12224]) Position ids shape: torch.Size([1, 12224]) Input IDs shape: torch.Size([1, 12224]) Labels shape: torch.Size([1, 12224]) Final batch size: 1, sequence length: 18470 Attention mask shape: torch.Size([1, 1, 18470, 18470]) Position ids shape: torch.Size([1, 18470]) Input IDs shape: torch.Size([1, 18470]) Labels shape: torch.Size([1, 18470]) Final batch size: 1, sequence length: 23338 Attention mask shape: torch.Size([1, 1, 23338, 23338]) Position ids shape: torch.Size([1, 23338]) Input IDs shape: torch.Size([1, 23338]) Labels shape: torch.Size([1, 23338]) Final batch size: 1, sequence length: 20184 Attention mask shape: torch.Size([1, 1, 20184, 20184]) Position ids shape: torch.Size([1, 20184]) Input IDs shape: torch.Size([1, 20184]) Labels shape: torch.Size([1, 20184]) Final batch size: 1, sequence length: 29404 Attention mask shape: torch.Size([1, 1, 29404, 29404]) Position ids shape: torch.Size([1, 29404]) Input IDs shape: torch.Size([1, 29404]) Labels shape: torch.Size([1, 29404]) Final batch size: 1, sequence length: 23975 Attention mask shape: torch.Size([1, 1, 23975, 23975]) Position ids shape: torch.Size([1, 23975]) Input IDs shape: torch.Size([1, 23975]) Labels shape: torch.Size([1, 23975]) Final batch size: 1, sequence length: 26138 Attention mask shape: torch.Size([1, 1, 26138, 26138]) Position ids shape: torch.Size([1, 26138]) Input IDs shape: torch.Size([1, 26138]) Labels shape: torch.Size([1, 26138]) Final batch size: 1, sequence length: 30592 Final batch size: 1, sequence length: 16057 Attention mask shape: torch.Size([1, 1, 30592, 30592]) Position ids shape: torch.Size([1, 30592]) Input IDs shape: torch.Size([1, 30592]) Labels shape: torch.Size([1, 30592]) Attention mask shape: torch.Size([1, 1, 16057, 16057]) Position ids shape: torch.Size([1, 16057]) Input IDs shape: torch.Size([1, 16057]) Labels shape: torch.Size([1, 16057]) Final batch size: 1, sequence length: 21766 Attention mask shape: torch.Size([1, 1, 21766, 21766]) Position ids shape: torch.Size([1, 21766]) Input IDs shape: torch.Size([1, 21766]) Labels shape: torch.Size([1, 21766]) Final batch size: 1, sequence length: 9704 Attention mask shape: torch.Size([1, 1, 9704, 9704]) Position ids shape: torch.Size([1, 9704]) Input IDs shape: torch.Size([1, 9704]) Labels shape: torch.Size([1, 9704]) Final batch size: 1, sequence length: 11515 Attention mask shape: torch.Size([1, 1, 11515, 11515]) Position ids shape: torch.Size([1, 11515]) Input IDs shape: torch.Size([1, 11515]) Labels shape: torch.Size([1, 11515]) Final batch size: 1, sequence length: 32473 Attention mask shape: torch.Size([1, 1, 32473, 32473]) Position ids shape: torch.Size([1, 32473]) Input IDs shape: torch.Size([1, 32473]) Labels shape: torch.Size([1, 32473]) Final batch size: 1, sequence length: 11608 Attention mask shape: torch.Size([1, 1, 11608, 11608]) Position ids shape: torch.Size([1, 11608]) Input IDs shape: torch.Size([1, 11608]) Labels shape: torch.Size([1, 11608]) Final batch size: 1, sequence length: 28897 Attention mask shape: torch.Size([1, 1, 28897, 28897]) Position ids shape: torch.Size([1, 28897]) Input IDs shape: torch.Size([1, 28897]) Labels shape: torch.Size([1, 28897]) Final batch size: 1, sequence length: 21672 Attention mask shape: torch.Size([1, 1, 21672, 21672]) Position ids shape: torch.Size([1, 21672]) Input IDs shape: torch.Size([1, 21672]) Labels shape: torch.Size([1, 21672]) Final batch size: 1, sequence length: 33482 Attention mask shape: torch.Size([1, 1, 33482, 33482]) Position ids shape: torch.Size([1, 33482]) Input IDs shape: torch.Size([1, 33482]) Labels shape: torch.Size([1, 33482]) Final batch size: 1, sequence length: 23558 Attention mask shape: torch.Size([1, 1, 23558, 23558]) Position ids shape: torch.Size([1, 23558]) Input IDs shape: torch.Size([1, 23558]) Labels shape: torch.Size([1, 23558]) Final batch size: 1, sequence length: 30965 Attention mask shape: torch.Size([1, 1, 30965, 30965]) Position ids shape: torch.Size([1, 30965]) Input IDs shape: torch.Size([1, 30965]) Labels shape: torch.Size([1, 30965]) Final batch size: 1, sequence length: 21581 Attention mask shape: torch.Size([1, 1, 21581, 21581]) Position ids shape: torch.Size([1, 21581]) Input IDs shape: torch.Size([1, 21581]) Labels shape: torch.Size([1, 21581]) Final batch size: 1, sequence length: 11403 Attention mask shape: torch.Size([1, 1, 11403, 11403]) Position ids shape: torch.Size([1, 11403]) Input IDs shape: torch.Size([1, 11403]) Labels shape: torch.Size([1, 11403]) Final batch size: 1, sequence length: 34469 Attention mask shape: torch.Size([1, 1, 34469, 34469]) Position ids shape: torch.Size([1, 34469]) Input IDs shape: torch.Size([1, 34469]) Labels shape: torch.Size([1, 34469]) Final batch size: 1, sequence length: 17456 Attention mask shape: torch.Size([1, 1, 17456, 17456]) Position ids shape: torch.Size([1, 17456]) Input IDs shape: torch.Size([1, 17456]) Labels shape: torch.Size([1, 17456]) Final batch size: 1, sequence length: 16257 Attention mask shape: torch.Size([1, 1, 16257, 16257]) Position ids shape: torch.Size([1, 16257]) Input IDs shape: torch.Size([1, 16257]) Labels shape: torch.Size([1, 16257]) Final batch size: 1, sequence length: 35485 Attention mask shape: torch.Size([1, 1, 35485, 35485]) Position ids shape: torch.Size([1, 35485]) Input IDs shape: torch.Size([1, 35485]) Labels shape: torch.Size([1, 35485]) Final batch size: 1, sequence length: 24121 Attention mask shape: torch.Size([1, 1, 24121, 24121]) Position ids shape: torch.Size([1, 24121]) Input IDs shape: torch.Size([1, 24121]) Labels shape: torch.Size([1, 24121]) Final batch size: 1, sequence length: 27702 Attention mask shape: torch.Size([1, 1, 27702, 27702]) Position ids shape: torch.Size([1, 27702]) Input IDs shape: torch.Size([1, 27702]) Labels shape: torch.Size([1, 27702]) Final batch size: 1, sequence length: 29481 Attention mask shape: torch.Size([1, 1, 29481, 29481]) Position ids shape: torch.Size([1, 29481]) Input IDs shape: torch.Size([1, 29481]) Labels shape: torch.Size([1, 29481]) Final batch size: 1, sequence length: 35100 Attention mask shape: torch.Size([1, 1, 35100, 35100]) Position ids shape: torch.Size([1, 35100]) Input IDs shape: torch.Size([1, 35100]) Labels shape: torch.Size([1, 35100]) Final batch size: 1, sequence length: 35030 Attention mask shape: torch.Size([1, 1, 35030, 35030]) Position ids shape: torch.Size([1, 35030]) Input IDs shape: torch.Size([1, 35030]) Labels shape: torch.Size([1, 35030]) Final batch size: 1, sequence length: 20198 Attention mask shape: torch.Size([1, 1, 20198, 20198]) Position ids shape: torch.Size([1, 20198]) Input IDs shape: torch.Size([1, 20198]) Labels shape: torch.Size([1, 20198]) Final batch size: 1, sequence length: 32596 Attention mask shape: torch.Size([1, 1, 32596, 32596]) Position ids shape: torch.Size([1, 32596]) Input IDs shape: torch.Size([1, 32596]) Labels shape: torch.Size([1, 32596]) Final batch size: 1, sequence length: 18126 Attention mask shape: torch.Size([1, 1, 18126, 18126]) Position ids shape: torch.Size([1, 18126]) Input IDs shape: torch.Size([1, 18126]) Labels shape: torch.Size([1, 18126]) Final batch size: 1, sequence length: 18023 Attention mask shape: torch.Size([1, 1, 18023, 18023]) Position ids shape: torch.Size([1, 18023]) Input IDs shape: torch.Size([1, 18023]) Labels shape: torch.Size([1, 18023]) Final batch size: 1, sequence length: 37040 Attention mask shape: torch.Size([1, 1, 37040, 37040]) Position ids shape: torch.Size([1, 37040]) Input IDs shape: torch.Size([1, 37040]) Labels shape: torch.Size([1, 37040]) Final batch size: 1, sequence length: 28773 Attention mask shape: torch.Size([1, 1, 28773, 28773]) Position ids shape: torch.Size([1, 28773]) Input IDs shape: torch.Size([1, 28773]) Labels shape: torch.Size([1, 28773]) Final batch size: 1, sequence length: 30428 Attention mask shape: torch.Size([1, 1, 30428, 30428]) Position ids shape: torch.Size([1, 30428]) Input IDs shape: torch.Size([1, 30428]) Labels shape: torch.Size([1, 30428]) Final batch size: 1, sequence length: 18454 Attention mask shape: torch.Size([1, 1, 18454, 18454]) Position ids shape: torch.Size([1, 18454]) Input IDs shape: torch.Size([1, 18454]) Labels shape: torch.Size([1, 18454]) Final batch size: 1, sequence length: 30772 Attention mask shape: torch.Size([1, 1, 30772, 30772]) Position ids shape: torch.Size([1, 30772]) Input IDs shape: torch.Size([1, 30772]) Labels shape: torch.Size([1, 30772]) Final batch size: 1, sequence length: 37980 Attention mask shape: torch.Size([1, 1, 37980, 37980]) Position ids shape: torch.Size([1, 37980]) Input IDs shape: torch.Size([1, 37980]) Labels shape: torch.Size([1, 37980]) Final batch size: 1, sequence length: 30072 Attention mask shape: torch.Size([1, 1, 30072, 30072]) Position ids shape: torch.Size([1, 30072]) Input IDs shape: torch.Size([1, 30072]) Labels shape: torch.Size([1, 30072]) Final batch size: 1, sequence length: 39589 Attention mask shape: torch.Size([1, 1, 39589, 39589]) Position ids shape: torch.Size([1, 39589]) Input IDs shape: torch.Size([1, 39589]) Labels shape: torch.Size([1, 39589]) Final batch size: 1, sequence length: 21491 Attention mask shape: torch.Size([1, 1, 21491, 21491]) Position ids shape: torch.Size([1, 21491]) Input IDs shape: torch.Size([1, 21491]) Labels shape: torch.Size([1, 21491]) Final batch size: 1, sequence length: 29537 Attention mask shape: torch.Size([1, 1, 29537, 29537]) Position ids shape: torch.Size([1, 29537]) Input IDs shape: torch.Size([1, 29537]) Labels shape: torch.Size([1, 29537]) Final batch size: 1, sequence length: 19620 Attention mask shape: torch.Size([1, 1, 19620, 19620]) Position ids shape: torch.Size([1, 19620]) Input IDs shape: torch.Size([1, 19620]) Labels shape: torch.Size([1, 19620]) Final batch size: 1, sequence length: 30789 Attention mask shape: torch.Size([1, 1, 30789, 30789]) Position ids shape: torch.Size([1, 30789]) Input IDs shape: torch.Size([1, 30789]) Labels shape: torch.Size([1, 30789]) Final batch size: 1, sequence length: 29464 Attention mask shape: torch.Size([1, 1, 29464, 29464]) Position ids shape: torch.Size([1, 29464]) Input IDs shape: torch.Size([1, 29464]) Labels shape: torch.Size([1, 29464]) Final batch size: 1, sequence length: 19538 Attention mask shape: torch.Size([1, 1, 19538, 19538]) Position ids shape: torch.Size([1, 19538]) Input IDs shape: torch.Size([1, 19538]) Labels shape: torch.Size([1, 19538]) Final batch size: 1, sequence length: 17870 Attention mask shape: torch.Size([1, 1, 17870, 17870]) Position ids shape: torch.Size([1, 17870]) Input IDs shape: torch.Size([1, 17870]) Labels shape: torch.Size([1, 17870]) Final batch size: 1, sequence length: 19702 Attention mask shape: torch.Size([1, 1, 19702, 19702]) Position ids shape: torch.Size([1, 19702]) Input IDs shape: torch.Size([1, 19702]) Labels shape: torch.Size([1, 19702]) Final batch size: 1, sequence length: 30859 Attention mask shape: torch.Size([1, 1, 30859, 30859]) Position ids shape: torch.Size([1, 30859]) Input IDs shape: torch.Size([1, 30859]) Labels shape: torch.Size([1, 30859]) Final batch size: 1, sequence length: 26635 Attention mask shape: torch.Size([1, 1, 26635, 26635]) Position ids shape: torch.Size([1, 26635]) Input IDs shape: torch.Size([1, 26635]) Labels shape: torch.Size([1, 26635]) Final batch size: 1, sequence length: 17811 Attention mask shape: torch.Size([1, 1, 17811, 17811]) Position ids shape: torch.Size([1, 17811]) Input IDs shape: torch.Size([1, 17811]) Labels shape: torch.Size([1, 17811]) Final batch size: 1, sequence length: 15830 Attention mask shape: torch.Size([1, 1, 15830, 15830]) Position ids shape: torch.Size([1, 15830]) Input IDs shape: torch.Size([1, 15830]) Labels shape: torch.Size([1, 15830]) Final batch size: 1, sequence length: 13622 Attention mask shape: torch.Size([1, 1, 13622, 13622]) Position ids shape: torch.Size([1, 13622]) Input IDs shape: torch.Size([1, 13622]) Labels shape: torch.Size([1, 13622]) Final batch size: 1, sequence length: 26708 Attention mask shape: torch.Size([1, 1, 26708, 26708]) Position ids shape: torch.Size([1, 26708]) Input IDs shape: torch.Size([1, 26708]) Labels shape: torch.Size([1, 26708]) Final batch size: 1, sequence length: 29875 Attention mask shape: torch.Size([1, 1, 29875, 29875]) Position ids shape: torch.Size([1, 29875]) Input IDs shape: torch.Size([1, 29875]) Labels shape: torch.Size([1, 29875]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 26215 Attention mask shape: torch.Size([1, 1, 26215, 26215]) Position ids shape: torch.Size([1, 26215]) Input IDs shape: torch.Size([1, 26215]) Labels shape: torch.Size([1, 26215]) Final batch size: 1, sequence length: 21936 Attention mask shape: torch.Size([1, 1, 21936, 21936]) Position ids shape: torch.Size([1, 21936]) Input IDs shape: torch.Size([1, 21936]) Labels shape: torch.Size([1, 21936]) Final batch size: 1, sequence length: 37992 Attention mask shape: torch.Size([1, 1, 37992, 37992]) Position ids shape: torch.Size([1, 37992]) Input IDs shape: torch.Size([1, 37992]) Labels shape: torch.Size([1, 37992]) Final batch size: 1, sequence length: 22309 Attention mask shape: torch.Size([1, 1, 22309, 22309]) Position ids shape: torch.Size([1, 22309]) Input IDs shape: torch.Size([1, 22309]) Labels shape: torch.Size([1, 22309]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40526 Attention mask shape: torch.Size([1, 1, 40526, 40526]) Position ids shape: torch.Size([1, 40526]) Input IDs shape: torch.Size([1, 40526]) Labels shape: torch.Size([1, 40526]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 33685 Attention mask shape: torch.Size([1, 1, 33685, 33685]) Position ids shape: torch.Size([1, 33685]) Input IDs shape: torch.Size([1, 33685]) Labels shape: torch.Size([1, 33685]) Final batch size: 1, sequence length: 14730 Attention mask shape: torch.Size([1, 1, 14730, 14730]) Position ids shape: torch.Size([1, 14730]) Input IDs shape: torch.Size([1, 14730]) Labels shape: torch.Size([1, 14730]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 38927 Attention mask shape: torch.Size([1, 1, 38927, 38927]) Position ids shape: torch.Size([1, 38927]) Input IDs shape: torch.Size([1, 38927]) Labels shape: torch.Size([1, 38927]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 36580 Attention mask shape: torch.Size([1, 1, 36580, 36580]) Position ids shape: torch.Size([1, 36580]) Input IDs shape: torch.Size([1, 36580]) Labels shape: torch.Size([1, 36580]) Final batch size: 1, sequence length: 17623 Attention mask shape: torch.Size([1, 1, 17623, 17623]) Position ids shape: torch.Size([1, 17623]) Input IDs shape: torch.Size([1, 17623]) Labels shape: torch.Size([1, 17623]) Final batch size: 1, sequence length: 18606 Attention mask shape: torch.Size([1, 1, 18606, 18606]) Position ids shape: torch.Size([1, 18606]) Input IDs shape: torch.Size([1, 18606]) Labels shape: torch.Size([1, 18606]) Final batch size: 1, sequence length: 37866 Attention mask shape: torch.Size([1, 1, 37866, 37866]) Position ids shape: torch.Size([1, 37866]) Input IDs shape: torch.Size([1, 37866]) Labels shape: torch.Size([1, 37866]) Final batch size: 1, sequence length: 21250 Attention mask shape: torch.Size([1, 1, 21250, 21250]) Position ids shape: torch.Size([1, 21250]) Input IDs shape: torch.Size([1, 21250]) Labels shape: torch.Size([1, 21250]) Final batch size: 1, sequence length: 28814 Attention mask shape: torch.Size([1, 1, 28814, 28814]) Position ids shape: torch.Size([1, 28814]) Input IDs shape: torch.Size([1, 28814]) Labels shape: torch.Size([1, 28814]) Final batch size: 1, sequence length: 15993 Attention mask shape: torch.Size([1, 1, 15993, 15993]) Position ids shape: torch.Size([1, 15993]) Input IDs shape: torch.Size([1, 15993]) Labels shape: torch.Size([1, 15993]) Final batch size: 1, sequence length: 22945 Attention mask shape: torch.Size([1, 1, 22945, 22945]) Position ids shape: torch.Size([1, 22945]) Input IDs shape: torch.Size([1, 22945]) Labels shape: torch.Size([1, 22945]) Final batch size: 1, sequence length: 34684 Attention mask shape: torch.Size([1, 1, 34684, 34684]) Position ids shape: torch.Size([1, 34684]) Input IDs shape: torch.Size([1, 34684]) Labels shape: torch.Size([1, 34684]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 24232 Attention mask shape: torch.Size([1, 1, 24232, 24232]) Position ids shape: torch.Size([1, 24232]) Input IDs shape: torch.Size([1, 24232]) Labels shape: torch.Size([1, 24232]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 35952 Attention mask shape: torch.Size([1, 1, 35952, 35952]) Position ids shape: torch.Size([1, 35952]) Input IDs shape: torch.Size([1, 35952]) Labels shape: torch.Size([1, 35952]) Final batch size: 1, sequence length: 29639 Attention mask shape: torch.Size([1, 1, 29639, 29639]) Position ids shape: torch.Size([1, 29639]) Input IDs shape: torch.Size([1, 29639]) Labels shape: torch.Size([1, 29639]) Final batch size: 1, sequence length: 31745 Attention mask shape: torch.Size([1, 1, 31745, 31745]) Position ids shape: torch.Size([1, 31745]) Input IDs shape: torch.Size([1, 31745]) Labels shape: torch.Size([1, 31745]) Final batch size: 1, sequence length: 37407 Attention mask shape: torch.Size([1, 1, 37407, 37407]) Position ids shape: torch.Size([1, 37407]) Input IDs shape: torch.Size([1, 37407]) Labels shape: torch.Size([1, 37407]) Final batch size: 1, sequence length: 16270 Attention mask shape: torch.Size([1, 1, 16270, 16270]) Position ids shape: torch.Size([1, 16270]) Input IDs shape: torch.Size([1, 16270]) Labels shape: torch.Size([1, 16270]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 35077 Attention mask shape: torch.Size([1, 1, 35077, 35077]) Position ids shape: torch.Size([1, 35077]) Input IDs shape: torch.Size([1, 35077]) Labels shape: torch.Size([1, 35077]) Final batch size: 1, sequence length: 36716 Attention mask shape: torch.Size([1, 1, 36716, 36716]) Position ids shape: torch.Size([1, 36716]) Input IDs shape: torch.Size([1, 36716]) Labels shape: torch.Size([1, 36716]) Final batch size: 1, sequence length: 28810 Attention mask shape: torch.Size([1, 1, 28810, 28810]) Position ids shape: torch.Size([1, 28810]) Input IDs shape: torch.Size([1, 28810]) Labels shape: torch.Size([1, 28810]) Final batch size: 1, sequence length: 19028 Attention mask shape: torch.Size([1, 1, 19028, 19028]) Position ids shape: torch.Size([1, 19028]) Input IDs shape: torch.Size([1, 19028]) Labels shape: torch.Size([1, 19028]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 25388 Attention mask shape: torch.Size([1, 1, 25388, 25388]) Position ids shape: torch.Size([1, 25388]) Input IDs shape: torch.Size([1, 25388]) Labels shape: torch.Size([1, 25388]) Final batch size: 1, sequence length: 17778 Attention mask shape: torch.Size([1, 1, 17778, 17778]) Position ids shape: torch.Size([1, 17778]) Input IDs shape: torch.Size([1, 17778]) Labels shape: torch.Size([1, 17778]) Final batch size: 1, sequence length: 14104 Attention mask shape: torch.Size([1, 1, 14104, 14104]) Position ids shape: torch.Size([1, 14104]) Input IDs shape: torch.Size([1, 14104]) Labels shape: torch.Size([1, 14104]) Final batch size: 1, sequence length: 26930 Attention mask shape: torch.Size([1, 1, 26930, 26930]) Position ids shape: torch.Size([1, 26930]) Input IDs shape: torch.Size([1, 26930]) Labels shape: torch.Size([1, 26930]) Final batch size: 1, sequence length: 39661 Attention mask shape: torch.Size([1, 1, 39661, 39661]) Position ids shape: torch.Size([1, 39661]) Input IDs shape: torch.Size([1, 39661]) Labels shape: torch.Size([1, 39661]) Final batch size: 1, sequence length: 30135 Attention mask shape: torch.Size([1, 1, 30135, 30135]) Position ids shape: torch.Size([1, 30135]) Input IDs shape: torch.Size([1, 30135]) Labels shape: torch.Size([1, 30135]) Final batch size: 1, sequence length: 20057 Attention mask shape: torch.Size([1, 1, 20057, 20057]) Position ids shape: torch.Size([1, 20057]) Input IDs shape: torch.Size([1, 20057]) Labels shape: torch.Size([1, 20057]) Final batch size: 1, sequence length: 21071 Attention mask shape: torch.Size([1, 1, 21071, 21071]) Position ids shape: torch.Size([1, 21071]) Input IDs shape: torch.Size([1, 21071]) Labels shape: torch.Size([1, 21071]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 20765 Attention mask shape: torch.Size([1, 1, 20765, 20765]) Position ids shape: torch.Size([1, 20765]) Input IDs shape: torch.Size([1, 20765]) Labels shape: torch.Size([1, 20765]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 17971 Attention mask shape: torch.Size([1, 1, 17971, 17971]) Position ids shape: torch.Size([1, 17971]) Input IDs shape: torch.Size([1, 17971]) Labels shape: torch.Size([1, 17971]) Final batch size: 1, sequence length: 21006 Attention mask shape: torch.Size([1, 1, 21006, 21006]) Position ids shape: torch.Size([1, 21006]) Input IDs shape: torch.Size([1, 21006]) Labels shape: torch.Size([1, 21006]) Final batch size: 1, sequence length: 31381 Attention mask shape: torch.Size([1, 1, 31381, 31381]) Position ids shape: torch.Size([1, 31381]) Input IDs shape: torch.Size([1, 31381]) Labels shape: torch.Size([1, 31381]) Final batch size: 1, sequence length: 26665 Attention mask shape: torch.Size([1, 1, 26665, 26665]) Position ids shape: torch.Size([1, 26665]) Input IDs shape: torch.Size([1, 26665]) Labels shape: torch.Size([1, 26665]) Final batch size: 1, sequence length: 31420 Attention mask shape: torch.Size([1, 1, 31420, 31420]) Position ids shape: torch.Size([1, 31420]) Input IDs shape: torch.Size([1, 31420]) Labels shape: torch.Size([1, 31420]) Final batch size: 1, sequence length: 27281 Attention mask shape: torch.Size([1, 1, 27281, 27281]) Position ids shape: torch.Size([1, 27281]) Input IDs shape: torch.Size([1, 27281]) Labels shape: torch.Size([1, 27281]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 13903 Attention mask shape: torch.Size([1, 1, 13903, 13903]) Position ids shape: torch.Size([1, 13903]) Input IDs shape: torch.Size([1, 13903]) Labels shape: torch.Size([1, 13903]) Final batch size: 1, sequence length: 37397 Attention mask shape: torch.Size([1, 1, 37397, 37397]) Position ids shape: torch.Size([1, 37397]) Input IDs shape: torch.Size([1, 37397]) Labels shape: torch.Size([1, 37397]) Final batch size: 1, sequence length: 21805 Attention mask shape: torch.Size([1, 1, 21805, 21805]) Position ids shape: torch.Size([1, 21805]) Input IDs shape: torch.Size([1, 21805]) Labels shape: torch.Size([1, 21805]) Final batch size: 1, sequence length: 25032 Attention mask shape: torch.Size([1, 1, 25032, 25032]) Position ids shape: torch.Size([1, 25032]) Input IDs shape: torch.Size([1, 25032]) Labels shape: torch.Size([1, 25032]) Final batch size: 1, sequence length: 40960 Final batch size: 1, sequence length: 32024 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Attention mask shape: torch.Size([1, 1, 32024, 32024]) Position ids shape: torch.Size([1, 32024]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 32024]) Labels shape: torch.Size([1, 32024]) Final batch size: 1, sequence length: 23187 Attention mask shape: torch.Size([1, 1, 23187, 23187]) Position ids shape: torch.Size([1, 23187]) Input IDs shape: torch.Size([1, 23187]) Labels shape: torch.Size([1, 23187]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 18566 Attention mask shape: torch.Size([1, 1, 18566, 18566]) Position ids shape: torch.Size([1, 18566]) Input IDs shape: torch.Size([1, 18566]) Labels shape: torch.Size([1, 18566]) Final batch size: 1, sequence length: 35879 Attention mask shape: torch.Size([1, 1, 35879, 35879]) Position ids shape: torch.Size([1, 35879]) Input IDs shape: torch.Size([1, 35879]) Labels shape: torch.Size([1, 35879]) Final batch size: 1, sequence length: 39778 Attention mask shape: torch.Size([1, 1, 39778, 39778]) Position ids shape: torch.Size([1, 39778]) Input IDs shape: torch.Size([1, 39778]) Labels shape: torch.Size([1, 39778]) Final batch size: 1, sequence length: 40579 Attention mask shape: torch.Size([1, 1, 40579, 40579]) Position ids shape: torch.Size([1, 40579]) Input IDs shape: torch.Size([1, 40579]) Labels shape: torch.Size([1, 40579]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 36415 Attention mask shape: torch.Size([1, 1, 36415, 36415]) Position ids shape: torch.Size([1, 36415]) Input IDs shape: torch.Size([1, 36415]) Labels shape: torch.Size([1, 36415]) Final batch size: 1, sequence length: 12200 Attention mask shape: torch.Size([1, 1, 12200, 12200]) Position ids shape: torch.Size([1, 12200]) Input IDs shape: torch.Size([1, 12200]) Labels shape: torch.Size([1, 12200]) Final batch size: 1, sequence length: 39587 Attention mask shape: torch.Size([1, 1, 39587, 39587]) Position ids shape: torch.Size([1, 39587]) Input IDs shape: torch.Size([1, 39587]) Labels shape: torch.Size([1, 39587]) Final batch size: 1, sequence length: 37890 Attention mask shape: torch.Size([1, 1, 37890, 37890]) Position ids shape: torch.Size([1, 37890]) Input IDs shape: torch.Size([1, 37890]) Labels shape: torch.Size([1, 37890]) Final batch size: 1, sequence length: 32541 Attention mask shape: torch.Size([1, 1, 32541, 32541]) Position ids shape: torch.Size([1, 32541]) Input IDs shape: torch.Size([1, 32541]) Labels shape: torch.Size([1, 32541]) Final batch size: 1, sequence length: 37530 Attention mask shape: torch.Size([1, 1, 37530, 37530]) Position ids shape: torch.Size([1, 37530]) Input IDs shape: torch.Size([1, 37530]) Labels shape: torch.Size([1, 37530]) Final batch size: 1, sequence length: 19668 Attention mask shape: torch.Size([1, 1, 19668, 19668]) Position ids shape: torch.Size([1, 19668]) Input IDs shape: torch.Size([1, 19668]) Labels shape: torch.Size([1, 19668]) Final batch size: 1, sequence length: 25383 Attention mask shape: torch.Size([1, 1, 25383, 25383]) Position ids shape: torch.Size([1, 25383]) Input IDs shape: torch.Size([1, 25383]) Labels shape: torch.Size([1, 25383]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 14308 Attention mask shape: torch.Size([1, 1, 14308, 14308]) Position ids shape: torch.Size([1, 14308]) Input IDs shape: torch.Size([1, 14308]) Labels shape: torch.Size([1, 14308]) Final batch size: 1, sequence length: 39579 Attention mask shape: torch.Size([1, 1, 39579, 39579]) Position ids shape: torch.Size([1, 39579]) Input IDs shape: torch.Size([1, 39579]) Labels shape: torch.Size([1, 39579]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 36647 Attention mask shape: torch.Size([1, 1, 36647, 36647]) Position ids shape: torch.Size([1, 36647]) Input IDs shape: torch.Size([1, 36647]) Labels shape: torch.Size([1, 36647]) Final batch size: 1, sequence length: 9251 Attention mask shape: torch.Size([1, 1, 9251, 9251]) Position ids shape: torch.Size([1, 9251]) Input IDs shape: torch.Size([1, 9251]) Labels shape: torch.Size([1, 9251]) {'loss': 0.2335, 'grad_norm': 0.25569555733730304, 'learning_rate': 1.6543469682057105e-06, 'num_tokens': -inf, 'epoch': 6.12} Final batch size: 1, sequence length: 4858 Attention mask shape: torch.Size([1, 1, 4858, 4858]) Position ids shape: torch.Size([1, 4858]) Input IDs shape: torch.Size([1, 4858]) Labels shape: torch.Size([1, 4858]) Final batch size: 1, sequence length: 6316 Attention mask shape: torch.Size([1, 1, 6316, 6316]) Position ids shape: torch.Size([1, 6316]) Input IDs shape: torch.Size([1, 6316]) Labels shape: torch.Size([1, 6316]) Final batch size: 1, sequence length: 7360 Attention mask shape: torch.Size([1, 1, 7360, 7360]) Position ids shape: torch.Size([1, 7360]) Input IDs shape: torch.Size([1, 7360]) Labels shape: torch.Size([1, 7360]) Final batch size: 1, sequence length: 11448 Attention mask shape: torch.Size([1, 1, 11448, 11448]) Position ids shape: torch.Size([1, 11448]) Input IDs shape: torch.Size([1, 11448]) Labels shape: torch.Size([1, 11448]) Final batch size: 1, sequence length: 12846 Attention mask shape: torch.Size([1, 1, 12846, 12846]) Position ids shape: torch.Size([1, 12846]) Input IDs shape: torch.Size([1, 12846]) Labels shape: torch.Size([1, 12846]) Final batch size: 1, sequence length: 12075 Attention mask shape: torch.Size([1, 1, 12075, 12075]) Position ids shape: torch.Size([1, 12075]) Input IDs shape: torch.Size([1, 12075]) Labels shape: torch.Size([1, 12075]) Final batch size: 1, sequence length: 12945 Attention mask shape: torch.Size([1, 1, 12945, 12945]) Position ids shape: torch.Size([1, 12945]) Input IDs shape: torch.Size([1, 12945]) Labels shape: torch.Size([1, 12945]) Final batch size: 1, sequence length: 14330 Attention mask shape: torch.Size([1, 1, 14330, 14330]) Position ids shape: torch.Size([1, 14330]) Input IDs shape: torch.Size([1, 14330]) Labels shape: torch.Size([1, 14330]) Final batch size: 1, sequence length: 14891 Attention mask shape: torch.Size([1, 1, 14891, 14891]) Position ids shape: torch.Size([1, 14891]) Input IDs shape: torch.Size([1, 14891]) Labels shape: torch.Size([1, 14891]) Final batch size: 1, sequence length: 15189 Attention mask shape: torch.Size([1, 1, 15189, 15189]) Position ids shape: torch.Size([1, 15189]) Input IDs shape: torch.Size([1, 15189]) Labels shape: torch.Size([1, 15189]) Final batch size: 1, sequence length: 16391 Attention mask shape: torch.Size([1, 1, 16391, 16391]) Position ids shape: torch.Size([1, 16391]) Input IDs shape: torch.Size([1, 16391]) Labels shape: torch.Size([1, 16391]) Final batch size: 1, sequence length: 16658 Attention mask shape: torch.Size([1, 1, 16658, 16658]) Position ids shape: torch.Size([1, 16658]) Input IDs shape: torch.Size([1, 16658]) Labels shape: torch.Size([1, 16658]) Final batch size: 1, sequence length: 17246 Attention mask shape: torch.Size([1, 1, 17246, 17246]) Position ids shape: torch.Size([1, 17246]) Input IDs shape: torch.Size([1, 17246]) Labels shape: torch.Size([1, 17246]) Final batch size: 1, sequence length: 16961 Attention mask shape: torch.Size([1, 1, 16961, 16961]) Position ids shape: torch.Size([1, 16961]) Input IDs shape: torch.Size([1, 16961]) Labels shape: torch.Size([1, 16961]) Final batch size: 1, sequence length: 16326 Attention mask shape: torch.Size([1, 1, 16326, 16326]) Position ids shape: torch.Size([1, 16326]) Input IDs shape: torch.Size([1, 16326]) Labels shape: torch.Size([1, 16326]) Final batch size: 1, sequence length: 18341 Attention mask shape: torch.Size([1, 1, 18341, 18341]) Position ids shape: torch.Size([1, 18341]) Input IDs shape: torch.Size([1, 18341]) Labels shape: torch.Size([1, 18341]) Final batch size: 1, sequence length: 18400 Attention mask shape: torch.Size([1, 1, 18400, 18400]) Position ids shape: torch.Size([1, 18400]) Input IDs shape: torch.Size([1, 18400]) Labels shape: torch.Size([1, 18400]) Final batch size: 1, sequence length: 16536 Attention mask shape: torch.Size([1, 1, 16536, 16536]) Position ids shape: torch.Size([1, 16536]) Input IDs shape: torch.Size([1, 16536]) Labels shape: torch.Size([1, 16536]) Final batch size: 1, sequence length: 18014 Attention mask shape: torch.Size([1, 1, 18014, 18014]) Position ids shape: torch.Size([1, 18014]) Input IDs shape: torch.Size([1, 18014]) Labels shape: torch.Size([1, 18014]) Final batch size: 1, sequence length: 19332 Attention mask shape: torch.Size([1, 1, 19332, 19332]) Position ids shape: torch.Size([1, 19332]) Input IDs shape: torch.Size([1, 19332]) Labels shape: torch.Size([1, 19332]) Final batch size: 1, sequence length: 19597 Attention mask shape: torch.Size([1, 1, 19597, 19597]) Position ids shape: torch.Size([1, 19597]) Input IDs shape: torch.Size([1, 19597]) Labels shape: torch.Size([1, 19597]) Final batch size: 1, sequence length: 19999 Attention mask shape: torch.Size([1, 1, 19999, 19999]) Position ids shape: torch.Size([1, 19999]) Input IDs shape: torch.Size([1, 19999]) Labels shape: torch.Size([1, 19999]) Final batch size: 1, sequence length: 21122 Attention mask shape: torch.Size([1, 1, 21122, 21122]) Position ids shape: torch.Size([1, 21122]) Input IDs shape: torch.Size([1, 21122]) Labels shape: torch.Size([1, 21122]) Final batch size: 1, sequence length: 18424 Attention mask shape: torch.Size([1, 1, 18424, 18424]) Position ids shape: torch.Size([1, 18424]) Input IDs shape: torch.Size([1, 18424]) Labels shape: torch.Size([1, 18424]) Final batch size: 1, sequence length: 20118 Attention mask shape: torch.Size([1, 1, 20118, 20118]) Position ids shape: torch.Size([1, 20118]) Input IDs shape: torch.Size([1, 20118]) Labels shape: torch.Size([1, 20118]) Final batch size: 1, sequence length: 21617 Attention mask shape: torch.Size([1, 1, 21617, 21617]) Position ids shape: torch.Size([1, 21617]) Input IDs shape: torch.Size([1, 21617]) Labels shape: torch.Size([1, 21617]) Final batch size: 1, sequence length: 16831 Attention mask shape: torch.Size([1, 1, 16831, 16831]) Position ids shape: torch.Size([1, 16831]) Input IDs shape: torch.Size([1, 16831]) Labels shape: torch.Size([1, 16831]) Final batch size: 1, sequence length: 18393 Attention mask shape: torch.Size([1, 1, 18393, 18393]) Position ids shape: torch.Size([1, 18393]) Input IDs shape: torch.Size([1, 18393]) Labels shape: torch.Size([1, 18393]) Final batch size: 1, sequence length: 20912 Attention mask shape: torch.Size([1, 1, 20912, 20912]) Position ids shape: torch.Size([1, 20912]) Input IDs shape: torch.Size([1, 20912]) Labels shape: torch.Size([1, 20912]) Final batch size: 1, sequence length: 22854 Attention mask shape: torch.Size([1, 1, 22854, 22854]) Position ids shape: torch.Size([1, 22854]) Input IDs shape: torch.Size([1, 22854]) Labels shape: torch.Size([1, 22854]) Final batch size: 1, sequence length: 24597 Attention mask shape: torch.Size([1, 1, 24597, 24597]) Position ids shape: torch.Size([1, 24597]) Input IDs shape: torch.Size([1, 24597]) Labels shape: torch.Size([1, 24597]) Final batch size: 1, sequence length: 20888 Attention mask shape: torch.Size([1, 1, 20888, 20888]) Position ids shape: torch.Size([1, 20888]) Input IDs shape: torch.Size([1, 20888]) Labels shape: torch.Size([1, 20888]) Final batch size: 1, sequence length: 21314 Attention mask shape: torch.Size([1, 1, 21314, 21314]) Position ids shape: torch.Size([1, 21314]) Input IDs shape: torch.Size([1, 21314]) Labels shape: torch.Size([1, 21314]) Final batch size: 1, sequence length: 12328 Attention mask shape: torch.Size([1, 1, 12328, 12328]) Position ids shape: torch.Size([1, 12328]) Input IDs shape: torch.Size([1, 12328]) Labels shape: torch.Size([1, 12328]) Final batch size: 1, sequence length: 21611 Attention mask shape: torch.Size([1, 1, 21611, 21611]) Position ids shape: torch.Size([1, 21611]) Input IDs shape: torch.Size([1, 21611]) Labels shape: torch.Size([1, 21611]) Final batch size: 1, sequence length: 6839 Attention mask shape: torch.Size([1, 1, 6839, 6839]) Position ids shape: torch.Size([1, 6839]) Input IDs shape: torch.Size([1, 6839]) Labels shape: torch.Size([1, 6839]) Final batch size: 1, sequence length: 9341 Attention mask shape: torch.Size([1, 1, 9341, 9341]) Position ids shape: torch.Size([1, 9341]) Input IDs shape: torch.Size([1, 9341]) Labels shape: torch.Size([1, 9341]) Final batch size: 1, sequence length: 13638 Attention mask shape: torch.Size([1, 1, 13638, 13638]) Position ids shape: torch.Size([1, 13638]) Input IDs shape: torch.Size([1, 13638]) Labels shape: torch.Size([1, 13638]) Final batch size: 1, sequence length: 21739 Attention mask shape: torch.Size([1, 1, 21739, 21739]) Position ids shape: torch.Size([1, 21739]) Input IDs shape: torch.Size([1, 21739]) Labels shape: torch.Size([1, 21739]) Final batch size: 1, sequence length: 25042 Attention mask shape: torch.Size([1, 1, 25042, 25042]) Position ids shape: torch.Size([1, 25042]) Input IDs shape: torch.Size([1, 25042]) Labels shape: torch.Size([1, 25042]) Final batch size: 1, sequence length: 25909 Attention mask shape: torch.Size([1, 1, 25909, 25909]) Position ids shape: torch.Size([1, 25909]) Input IDs shape: torch.Size([1, 25909]) Labels shape: torch.Size([1, 25909]) Final batch size: 1, sequence length: 25575 Attention mask shape: torch.Size([1, 1, 25575, 25575]) Position ids shape: torch.Size([1, 25575]) Input IDs shape: torch.Size([1, 25575]) Labels shape: torch.Size([1, 25575]) Final batch size: 1, sequence length: 20619 Attention mask shape: torch.Size([1, 1, 20619, 20619]) Position ids shape: torch.Size([1, 20619]) Input IDs shape: torch.Size([1, 20619]) Labels shape: torch.Size([1, 20619]) Final batch size: 1, sequence length: 20702 Attention mask shape: torch.Size([1, 1, 20702, 20702]) Position ids shape: torch.Size([1, 20702]) Input IDs shape: torch.Size([1, 20702]) Labels shape: torch.Size([1, 20702]) Final batch size: 1, sequence length: 21725 Attention mask shape: torch.Size([1, 1, 21725, 21725]) Position ids shape: torch.Size([1, 21725]) Input IDs shape: torch.Size([1, 21725]) Labels shape: torch.Size([1, 21725]) Final batch size: 1, sequence length: 26218 Attention mask shape: torch.Size([1, 1, 26218, 26218]) Position ids shape: torch.Size([1, 26218]) Input IDs shape: torch.Size([1, 26218]) Labels shape: torch.Size([1, 26218]) Final batch size: 1, sequence length: 21677 Attention mask shape: torch.Size([1, 1, 21677, 21677]) Position ids shape: torch.Size([1, 21677]) Input IDs shape: torch.Size([1, 21677]) Labels shape: torch.Size([1, 21677]) Final batch size: 1, sequence length: 28348 Attention mask shape: torch.Size([1, 1, 28348, 28348]) Position ids shape: torch.Size([1, 28348]) Input IDs shape: torch.Size([1, 28348]) Labels shape: torch.Size([1, 28348]) Final batch size: 1, sequence length: 15232 Attention mask shape: torch.Size([1, 1, 15232, 15232]) Position ids shape: torch.Size([1, 15232]) Input IDs shape: torch.Size([1, 15232]) Labels shape: torch.Size([1, 15232]) Final batch size: 1, sequence length: 15026 Attention mask shape: torch.Size([1, 1, 15026, 15026]) Position ids shape: torch.Size([1, 15026]) Input IDs shape: torch.Size([1, 15026]) Labels shape: torch.Size([1, 15026]) Final batch size: 1, sequence length: 16611 Attention mask shape: torch.Size([1, 1, 16611, 16611]) Position ids shape: torch.Size([1, 16611]) Input IDs shape: torch.Size([1, 16611]) Labels shape: torch.Size([1, 16611]) Final batch size: 1, sequence length: 13646 Attention mask shape: torch.Size([1, 1, 13646, 13646]) Position ids shape: torch.Size([1, 13646]) Input IDs shape: torch.Size([1, 13646]) Labels shape: torch.Size([1, 13646]) Final batch size: 1, sequence length: 29730 Attention mask shape: torch.Size([1, 1, 29730, 29730]) Position ids shape: torch.Size([1, 29730]) Input IDs shape: torch.Size([1, 29730]) Labels shape: torch.Size([1, 29730]) Final batch size: 1, sequence length: 28678 Attention mask shape: torch.Size([1, 1, 28678, 28678]) Position ids shape: torch.Size([1, 28678]) Input IDs shape: torch.Size([1, 28678]) Labels shape: torch.Size([1, 28678]) Final batch size: 1, sequence length: 19885 Attention mask shape: torch.Size([1, 1, 19885, 19885]) Position ids shape: torch.Size([1, 19885]) Input IDs shape: torch.Size([1, 19885]) Labels shape: torch.Size([1, 19885]) Final batch size: 1, sequence length: 23614 Attention mask shape: torch.Size([1, 1, 23614, 23614]) Position ids shape: torch.Size([1, 23614]) Input IDs shape: torch.Size([1, 23614]) Labels shape: torch.Size([1, 23614]) Final batch size: 1, sequence length: 24090 Attention mask shape: torch.Size([1, 1, 24090, 24090]) Position ids shape: torch.Size([1, 24090]) Input IDs shape: torch.Size([1, 24090]) Labels shape: torch.Size([1, 24090]) Final batch size: 1, sequence length: 22217 Attention mask shape: torch.Size([1, 1, 22217, 22217]) Position ids shape: torch.Size([1, 22217]) Input IDs shape: torch.Size([1, 22217]) Labels shape: torch.Size([1, 22217]) Final batch size: 1, sequence length: 24342 Attention mask shape: torch.Size([1, 1, 24342, 24342]) Position ids shape: torch.Size([1, 24342]) Input IDs shape: torch.Size([1, 24342]) Labels shape: torch.Size([1, 24342]) Final batch size: 1, sequence length: 21623 Attention mask shape: torch.Size([1, 1, 21623, 21623]) Position ids shape: torch.Size([1, 21623]) Input IDs shape: torch.Size([1, 21623]) Labels shape: torch.Size([1, 21623]) Final batch size: 1, sequence length: 32034 Attention mask shape: torch.Size([1, 1, 32034, 32034]) Position ids shape: torch.Size([1, 32034]) Input IDs shape: torch.Size([1, 32034]) Labels shape: torch.Size([1, 32034]) Final batch size: 1, sequence length: 16403 Attention mask shape: torch.Size([1, 1, 16403, 16403]) Position ids shape: torch.Size([1, 16403]) Input IDs shape: torch.Size([1, 16403]) Labels shape: torch.Size([1, 16403]) Final batch size: 1, sequence length: 30766 Attention mask shape: torch.Size([1, 1, 30766, 30766]) Position ids shape: torch.Size([1, 30766]) Input IDs shape: torch.Size([1, 30766]) Labels shape: torch.Size([1, 30766]) Final batch size: 1, sequence length: 17780 Attention mask shape: torch.Size([1, 1, 17780, 17780]) Position ids shape: torch.Size([1, 17780]) Input IDs shape: torch.Size([1, 17780]) Labels shape: torch.Size([1, 17780]) Final batch size: 1, sequence length: 20915 Attention mask shape: torch.Size([1, 1, 20915, 20915]) Position ids shape: torch.Size([1, 20915]) Input IDs shape: torch.Size([1, 20915]) Labels shape: torch.Size([1, 20915]) Final batch size: 1, sequence length: 27795 Attention mask shape: torch.Size([1, 1, 27795, 27795]) Position ids shape: torch.Size([1, 27795]) Input IDs shape: torch.Size([1, 27795]) Labels shape: torch.Size([1, 27795]) Final batch size: 1, sequence length: 30917 Attention mask shape: torch.Size([1, 1, 30917, 30917]) Position ids shape: torch.Size([1, 30917]) Input IDs shape: torch.Size([1, 30917]) Labels shape: torch.Size([1, 30917]) Final batch size: 1, sequence length: 30366 Attention mask shape: torch.Size([1, 1, 30366, 30366]) Position ids shape: torch.Size([1, 30366]) Input IDs shape: torch.Size([1, 30366]) Labels shape: torch.Size([1, 30366]) Final batch size: 1, sequence length: 20028 Attention mask shape: torch.Size([1, 1, 20028, 20028]) Position ids shape: torch.Size([1, 20028]) Input IDs shape: torch.Size([1, 20028]) Labels shape: torch.Size([1, 20028]) Final batch size: 1, sequence length: 25698 Attention mask shape: torch.Size([1, 1, 25698, 25698]) Position ids shape: torch.Size([1, 25698]) Input IDs shape: torch.Size([1, 25698]) Labels shape: torch.Size([1, 25698]) Final batch size: 1, sequence length: 30410 Attention mask shape: torch.Size([1, 1, 30410, 30410]) Position ids shape: torch.Size([1, 30410]) Input IDs shape: torch.Size([1, 30410]) Labels shape: torch.Size([1, 30410]) Final batch size: 1, sequence length: 34802 Attention mask shape: torch.Size([1, 1, 34802, 34802]) Position ids shape: torch.Size([1, 34802]) Input IDs shape: torch.Size([1, 34802]) Labels shape: torch.Size([1, 34802]) Final batch size: 1, sequence length: 35781 Attention mask shape: torch.Size([1, 1, 35781, 35781]) Position ids shape: torch.Size([1, 35781]) Input IDs shape: torch.Size([1, 35781]) Labels shape: torch.Size([1, 35781]) Final batch size: 1, sequence length: 30796 Attention mask shape: torch.Size([1, 1, 30796, 30796]) Position ids shape: torch.Size([1, 30796]) Input IDs shape: torch.Size([1, 30796]) Labels shape: torch.Size([1, 30796]) Final batch size: 1, sequence length: 21562 Attention mask shape: torch.Size([1, 1, 21562, 21562]) Position ids shape: torch.Size([1, 21562]) Input IDs shape: torch.Size([1, 21562]) Labels shape: torch.Size([1, 21562]) Final batch size: 1, sequence length: 35628 Attention mask shape: torch.Size([1, 1, 35628, 35628]) Position ids shape: torch.Size([1, 35628]) Input IDs shape: torch.Size([1, 35628]) Labels shape: torch.Size([1, 35628]) Final batch size: 1, sequence length: 38190 Attention mask shape: torch.Size([1, 1, 38190, 38190]) Position ids shape: torch.Size([1, 38190]) Input IDs shape: torch.Size([1, 38190]) Labels shape: torch.Size([1, 38190]) Final batch size: 1, sequence length: 22768 Attention mask shape: torch.Size([1, 1, 22768, 22768]) Position ids shape: torch.Size([1, 22768]) Input IDs shape: torch.Size([1, 22768]) Labels shape: torch.Size([1, 22768]) Final batch size: 1, sequence length: 15726 Attention mask shape: torch.Size([1, 1, 15726, 15726]) Position ids shape: torch.Size([1, 15726]) Input IDs shape: torch.Size([1, 15726]) Labels shape: torch.Size([1, 15726]) Final batch size: 1, sequence length: 32308 Attention mask shape: torch.Size([1, 1, 32308, 32308]) Position ids shape: torch.Size([1, 32308]) Input IDs shape: torch.Size([1, 32308]) Labels shape: torch.Size([1, 32308]) Final batch size: 1, sequence length: 37195 Attention mask shape: torch.Size([1, 1, 37195, 37195]) Position ids shape: torch.Size([1, 37195]) Input IDs shape: torch.Size([1, 37195]) Labels shape: torch.Size([1, 37195]) Final batch size: 1, sequence length: 35697 Attention mask shape: torch.Size([1, 1, 35697, 35697]) Position ids shape: torch.Size([1, 35697]) Input IDs shape: torch.Size([1, 35697]) Labels shape: torch.Size([1, 35697]) Final batch size: 1, sequence length: 32239 Attention mask shape: torch.Size([1, 1, 32239, 32239]) Position ids shape: torch.Size([1, 32239]) Input IDs shape: torch.Size([1, 32239]) Labels shape: torch.Size([1, 32239]) Final batch size: 1, sequence length: 32630 Attention mask shape: torch.Size([1, 1, 32630, 32630]) Position ids shape: torch.Size([1, 32630]) Input IDs shape: torch.Size([1, 32630]) Labels shape: torch.Size([1, 32630]) Final batch size: 1, sequence length: 38420 Attention mask shape: torch.Size([1, 1, 38420, 38420]) Position ids shape: torch.Size([1, 38420]) Input IDs shape: torch.Size([1, 38420]) Labels shape: torch.Size([1, 38420]) Final batch size: 1, sequence length: 23626 Attention mask shape: torch.Size([1, 1, 23626, 23626]) Position ids shape: torch.Size([1, 23626]) Input IDs shape: torch.Size([1, 23626]) Labels shape: torch.Size([1, 23626]) Final batch size: 1, sequence length: 39608 Attention mask shape: torch.Size([1, 1, 39608, 39608]) Position ids shape: torch.Size([1, 39608]) Input IDs shape: torch.Size([1, 39608]) Labels shape: torch.Size([1, 39608]) Final batch size: 1, sequence length: 22199 Attention mask shape: torch.Size([1, 1, 22199, 22199]) Position ids shape: torch.Size([1, 22199]) Input IDs shape: torch.Size([1, 22199]) Labels shape: torch.Size([1, 22199]) Final batch size: 1, sequence length: 38362 Attention mask shape: torch.Size([1, 1, 38362, 38362]) Position ids shape: torch.Size([1, 38362]) Input IDs shape: torch.Size([1, 38362]) Labels shape: torch.Size([1, 38362]) Final batch size: 1, sequence length: 31143 Attention mask shape: torch.Size([1, 1, 31143, 31143]) Position ids shape: torch.Size([1, 31143]) Input IDs shape: torch.Size([1, 31143]) Labels shape: torch.Size([1, 31143]) Final batch size: 1, sequence length: 40088 Attention mask shape: torch.Size([1, 1, 40088, 40088]) Position ids shape: torch.Size([1, 40088]) Input IDs shape: torch.Size([1, 40088]) Labels shape: torch.Size([1, 40088]) Final batch size: 1, sequence length: 14828 Attention mask shape: torch.Size([1, 1, 14828, 14828]) Position ids shape: torch.Size([1, 14828]) Input IDs shape: torch.Size([1, 14828]) Labels shape: torch.Size([1, 14828]) Final batch size: 1, sequence length: 14070 Attention mask shape: torch.Size([1, 1, 14070, 14070]) Position ids shape: torch.Size([1, 14070]) Input IDs shape: torch.Size([1, 14070]) Labels shape: torch.Size([1, 14070]) Final batch size: 1, sequence length: 26840 Attention mask shape: torch.Size([1, 1, 26840, 26840]) Position ids shape: torch.Size([1, 26840]) Input IDs shape: torch.Size([1, 26840]) Labels shape: torch.Size([1, 26840]) Final batch size: 1, sequence length: 18568 Attention mask shape: torch.Size([1, 1, 18568, 18568]) Position ids shape: torch.Size([1, 18568]) Input IDs shape: torch.Size([1, 18568]) Labels shape: torch.Size([1, 18568]) Final batch size: 1, sequence length: 10426 Attention mask shape: torch.Size([1, 1, 10426, 10426]) Position ids shape: torch.Size([1, 10426]) Input IDs shape: torch.Size([1, 10426]) Labels shape: torch.Size([1, 10426]) Final batch size: 1, sequence length: 11896 Attention mask shape: torch.Size([1, 1, 11896, 11896]) Position ids shape: torch.Size([1, 11896]) Input IDs shape: torch.Size([1, 11896]) Labels shape: torch.Size([1, 11896]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 29202 Attention mask shape: torch.Size([1, 1, 29202, 29202]) Position ids shape: torch.Size([1, 29202]) Input IDs shape: torch.Size([1, 29202]) Labels shape: torch.Size([1, 29202]) Final batch size: 1, sequence length: 30651 Attention mask shape: torch.Size([1, 1, 30651, 30651]) Position ids shape: torch.Size([1, 30651]) Input IDs shape: torch.Size([1, 30651]) Labels shape: torch.Size([1, 30651]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 37529 Attention mask shape: torch.Size([1, 1, 37529, 37529]) Position ids shape: torch.Size([1, 37529]) Input IDs shape: torch.Size([1, 37529]) Labels shape: torch.Size([1, 37529]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 34785 Attention mask shape: torch.Size([1, 1, 34785, 34785]) Position ids shape: torch.Size([1, 34785]) Input IDs shape: torch.Size([1, 34785]) Final batch size: 1, sequence length: 40960 Labels shape: torch.Size([1, 34785]) Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 35914 Attention mask shape: torch.Size([1, 1, 35914, 35914]) Position ids shape: torch.Size([1, 35914]) Input IDs shape: torch.Size([1, 35914]) Labels shape: torch.Size([1, 35914]) Final batch size: 1, sequence length: 16304 Attention mask shape: torch.Size([1, 1, 16304, 16304]) Position ids shape: torch.Size([1, 16304]) Input IDs shape: torch.Size([1, 16304]) Labels shape: torch.Size([1, 16304]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 18859 Attention mask shape: torch.Size([1, 1, 18859, 18859]) Position ids shape: torch.Size([1, 18859]) Input IDs shape: torch.Size([1, 18859]) Labels shape: torch.Size([1, 18859]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 26287 Attention mask shape: torch.Size([1, 1, 26287, 26287]) Position ids shape: torch.Size([1, 26287]) Input IDs shape: torch.Size([1, 26287]) Labels shape: torch.Size([1, 26287]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 10198 Attention mask shape: torch.Size([1, 1, 10198, 10198]) Position ids shape: torch.Size([1, 10198]) Input IDs shape: torch.Size([1, 10198]) Labels shape: torch.Size([1, 10198]) Final batch size: 1, sequence length: 23936 Attention mask shape: torch.Size([1, 1, 23936, 23936]) Position ids shape: torch.Size([1, 23936]) Input IDs shape: torch.Size([1, 23936]) Labels shape: torch.Size([1, 23936]) Final batch size: 1, sequence length: 32559 Attention mask shape: torch.Size([1, 1, 32559, 32559]) Position ids shape: torch.Size([1, 32559]) Input IDs shape: torch.Size([1, 32559]) Labels shape: torch.Size([1, 32559]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 27819 Attention mask shape: torch.Size([1, 1, 27819, 27819]) Position ids shape: torch.Size([1, 27819]) Input IDs shape: torch.Size([1, 27819]) Labels shape: torch.Size([1, 27819]) Final batch size: 1, sequence length: 36638 Attention mask shape: torch.Size([1, 1, 36638, 36638]) Position ids shape: torch.Size([1, 36638]) Input IDs shape: torch.Size([1, 36638]) Labels shape: torch.Size([1, 36638]) Final batch size: 1, sequence length: 19494 Attention mask shape: torch.Size([1, 1, 19494, 19494]) Position ids shape: torch.Size([1, 19494]) Input IDs shape: torch.Size([1, 19494]) Labels shape: torch.Size([1, 19494]) Final batch size: 1, sequence length: 12605 Attention mask shape: torch.Size([1, 1, 12605, 12605]) Position ids shape: torch.Size([1, 12605]) Input IDs shape: torch.Size([1, 12605]) Labels shape: torch.Size([1, 12605]) Final batch size: 1, sequence length: 31316 Attention mask shape: torch.Size([1, 1, 31316, 31316]) Position ids shape: torch.Size([1, 31316]) Input IDs shape: torch.Size([1, 31316]) Labels shape: torch.Size([1, 31316]) Final batch size: 1, sequence length: 30835 Attention mask shape: torch.Size([1, 1, 30835, 30835]) Position ids shape: torch.Size([1, 30835]) Input IDs shape: torch.Size([1, 30835]) Labels shape: torch.Size([1, 30835]) Final batch size: 1, sequence length: 26269 Attention mask shape: torch.Size([1, 1, 26269, 26269]) Position ids shape: torch.Size([1, 26269]) Input IDs shape: torch.Size([1, 26269]) Labels shape: torch.Size([1, 26269]) Final batch size: 1, sequence length: 31754 Attention mask shape: torch.Size([1, 1, 31754, 31754]) Position ids shape: torch.Size([1, 31754]) Input IDs shape: torch.Size([1, 31754]) Labels shape: torch.Size([1, 31754]) Final batch size: 1, sequence length: 40763 Attention mask shape: torch.Size([1, 1, 40763, 40763]) Position ids shape: torch.Size([1, 40763]) Input IDs shape: torch.Size([1, 40763]) Labels shape: torch.Size([1, 40763]) Final batch size: 1, sequence length: 25573 Attention mask shape: torch.Size([1, 1, 25573, 25573]) Position ids shape: torch.Size([1, 25573]) Input IDs shape: torch.Size([1, 25573]) Labels shape: torch.Size([1, 25573]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 37170 Attention mask shape: torch.Size([1, 1, 37170, 37170]) Position ids shape: torch.Size([1, 37170]) Input IDs shape: torch.Size([1, 37170]) Labels shape: torch.Size([1, 37170]) {'loss': 0.2622, 'grad_norm': 0.19964837262868357, 'learning_rate': 1.4644660940672628e-06, 'num_tokens': -inf, 'epoch': 6.25} Final batch size: 1, sequence length: 7998 Attention mask shape: torch.Size([1, 1, 7998, 7998]) Position ids shape: torch.Size([1, 7998]) Input IDs shape: torch.Size([1, 7998]) Labels shape: torch.Size([1, 7998]) Final batch size: 1, sequence length: 6925 Attention mask shape: torch.Size([1, 1, 6925, 6925]) Position ids shape: torch.Size([1, 6925]) Input IDs shape: torch.Size([1, 6925]) Labels shape: torch.Size([1, 6925]) Final batch size: 1, sequence length: 7402 Attention mask shape: torch.Size([1, 1, 7402, 7402]) Position ids shape: torch.Size([1, 7402]) Input IDs shape: torch.Size([1, 7402]) Labels shape: torch.Size([1, 7402]) Final batch size: 1, sequence length: 10102 Attention mask shape: torch.Size([1, 1, 10102, 10102]) Position ids shape: torch.Size([1, 10102]) Input IDs shape: torch.Size([1, 10102]) Labels shape: torch.Size([1, 10102]) Final batch size: 1, sequence length: 9243 Attention mask shape: torch.Size([1, 1, 9243, 9243]) Position ids shape: torch.Size([1, 9243]) Input IDs shape: torch.Size([1, 9243]) Labels shape: torch.Size([1, 9243]) Final batch size: 1, sequence length: 9452 Attention mask shape: torch.Size([1, 1, 9452, 9452]) Position ids shape: torch.Size([1, 9452]) Input IDs shape: torch.Size([1, 9452]) Labels shape: torch.Size([1, 9452]) Final batch size: 1, sequence length: 10862 Attention mask shape: torch.Size([1, 1, 10862, 10862]) Position ids shape: torch.Size([1, 10862]) Input IDs shape: torch.Size([1, 10862]) Labels shape: torch.Size([1, 10862]) Final batch size: 1, sequence length: 11947 Attention mask shape: torch.Size([1, 1, 11947, 11947]) Position ids shape: torch.Size([1, 11947]) Input IDs shape: torch.Size([1, 11947]) Labels shape: torch.Size([1, 11947]) Final batch size: 1, sequence length: 12719 Attention mask shape: torch.Size([1, 1, 12719, 12719]) Position ids shape: torch.Size([1, 12719]) Input IDs shape: torch.Size([1, 12719]) Labels shape: torch.Size([1, 12719]) Final batch size: 1, sequence length: 10804 Attention mask shape: torch.Size([1, 1, 10804, 10804]) Position ids shape: torch.Size([1, 10804]) Input IDs shape: torch.Size([1, 10804]) Labels shape: torch.Size([1, 10804]) Final batch size: 1, sequence length: 8432 Attention mask shape: torch.Size([1, 1, 8432, 8432]) Position ids shape: torch.Size([1, 8432]) Input IDs shape: torch.Size([1, 8432]) Labels shape: torch.Size([1, 8432]) Final batch size: 1, sequence length: 11623 Attention mask shape: torch.Size([1, 1, 11623, 11623]) Position ids shape: torch.Size([1, 11623]) Input IDs shape: torch.Size([1, 11623]) Labels shape: torch.Size([1, 11623]) Final batch size: 1, sequence length: 13209 Attention mask shape: torch.Size([1, 1, 13209, 13209]) Position ids shape: torch.Size([1, 13209]) Input IDs shape: torch.Size([1, 13209]) Labels shape: torch.Size([1, 13209]) Final batch size: 1, sequence length: 16060 Attention mask shape: torch.Size([1, 1, 16060, 16060]) Position ids shape: torch.Size([1, 16060]) Input IDs shape: torch.Size([1, 16060]) Labels shape: torch.Size([1, 16060]) Final batch size: 1, sequence length: 15886 Attention mask shape: torch.Size([1, 1, 15886, 15886]) Position ids shape: torch.Size([1, 15886]) Input IDs shape: torch.Size([1, 15886]) Labels shape: torch.Size([1, 15886]) Final batch size: 1, sequence length: 15053 Attention mask shape: torch.Size([1, 1, 15053, 15053]) Position ids shape: torch.Size([1, 15053]) Input IDs shape: torch.Size([1, 15053]) Labels shape: torch.Size([1, 15053]) Final batch size: 1, sequence length: 17370 Attention mask shape: torch.Size([1, 1, 17370, 17370]) Position ids shape: torch.Size([1, 17370]) Input IDs shape: torch.Size([1, 17370]) Labels shape: torch.Size([1, 17370]) Final batch size: 1, sequence length: 17376 Attention mask shape: torch.Size([1, 1, 17376, 17376]) Position ids shape: torch.Size([1, 17376]) Input IDs shape: torch.Size([1, 17376]) Labels shape: torch.Size([1, 17376]) Final batch size: 1, sequence length: 17918 Attention mask shape: torch.Size([1, 1, 17918, 17918]) Position ids shape: torch.Size([1, 17918]) Input IDs shape: torch.Size([1, 17918]) Labels shape: torch.Size([1, 17918]) Final batch size: 1, sequence length: 15794 Attention mask shape: torch.Size([1, 1, 15794, 15794]) Position ids shape: torch.Size([1, 15794]) Input IDs shape: torch.Size([1, 15794]) Labels shape: torch.Size([1, 15794]) Final batch size: 1, sequence length: 17767 Attention mask shape: torch.Size([1, 1, 17767, 17767]) Position ids shape: torch.Size([1, 17767]) Input IDs shape: torch.Size([1, 17767]) Labels shape: torch.Size([1, 17767]) Final batch size: 1, sequence length: 19935 Attention mask shape: torch.Size([1, 1, 19935, 19935]) Position ids shape: torch.Size([1, 19935]) Input IDs shape: torch.Size([1, 19935]) Labels shape: torch.Size([1, 19935]) Final batch size: 1, sequence length: 20185 Attention mask shape: torch.Size([1, 1, 20185, 20185]) Position ids shape: torch.Size([1, 20185]) Input IDs shape: torch.Size([1, 20185]) Labels shape: torch.Size([1, 20185]) Final batch size: 1, sequence length: 16536 Attention mask shape: torch.Size([1, 1, 16536, 16536]) Position ids shape: torch.Size([1, 16536]) Input IDs shape: torch.Size([1, 16536]) Labels shape: torch.Size([1, 16536]) Final batch size: 1, sequence length: 20554 Attention mask shape: torch.Size([1, 1, 20554, 20554]) Position ids shape: torch.Size([1, 20554]) Input IDs shape: torch.Size([1, 20554]) Labels shape: torch.Size([1, 20554]) Final batch size: 1, sequence length: 20056 Attention mask shape: torch.Size([1, 1, 20056, 20056]) Position ids shape: torch.Size([1, 20056]) Input IDs shape: torch.Size([1, 20056]) Labels shape: torch.Size([1, 20056]) Final batch size: 1, sequence length: 21117 Attention mask shape: torch.Size([1, 1, 21117, 21117]) Position ids shape: torch.Size([1, 21117]) Input IDs shape: torch.Size([1, 21117]) Labels shape: torch.Size([1, 21117]) Final batch size: 1, sequence length: 21980 Attention mask shape: torch.Size([1, 1, 21980, 21980]) Position ids shape: torch.Size([1, 21980]) Input IDs shape: torch.Size([1, 21980]) Labels shape: torch.Size([1, 21980]) Final batch size: 1, sequence length: 8498 Attention mask shape: torch.Size([1, 1, 8498, 8498]) Position ids shape: torch.Size([1, 8498]) Input IDs shape: torch.Size([1, 8498]) Labels shape: torch.Size([1, 8498]) Final batch size: 1, sequence length: 19609 Attention mask shape: torch.Size([1, 1, 19609, 19609]) Position ids shape: torch.Size([1, 19609]) Input IDs shape: torch.Size([1, 19609]) Labels shape: torch.Size([1, 19609]) Final batch size: 1, sequence length: 20524 Attention mask shape: torch.Size([1, 1, 20524, 20524]) Position ids shape: torch.Size([1, 20524]) Input IDs shape: torch.Size([1, 20524]) Labels shape: torch.Size([1, 20524]) Final batch size: 1, sequence length: 19629 Attention mask shape: torch.Size([1, 1, 19629, 19629]) Position ids shape: torch.Size([1, 19629]) Input IDs shape: torch.Size([1, 19629]) Labels shape: torch.Size([1, 19629]) Final batch size: 1, sequence length: 22797 Attention mask shape: torch.Size([1, 1, 22797, 22797]) Position ids shape: torch.Size([1, 22797]) Input IDs shape: torch.Size([1, 22797]) Labels shape: torch.Size([1, 22797]) Final batch size: 1, sequence length: 21771 Attention mask shape: torch.Size([1, 1, 21771, 21771]) Position ids shape: torch.Size([1, 21771]) Input IDs shape: torch.Size([1, 21771]) Labels shape: torch.Size([1, 21771]) Final batch size: 1, sequence length: 17625 Final batch size: 1, sequence length: 22208 Attention mask shape: torch.Size([1, 1, 17625, 17625]) Position ids shape: torch.Size([1, 17625]) Input IDs shape: torch.Size([1, 17625]) Labels shape: torch.Size([1, 17625]) Attention mask shape: torch.Size([1, 1, 22208, 22208]) Position ids shape: torch.Size([1, 22208]) Input IDs shape: torch.Size([1, 22208]) Labels shape: torch.Size([1, 22208]) Final batch size: 1, sequence length: 12006 Attention mask shape: torch.Size([1, 1, 12006, 12006]) Position ids shape: torch.Size([1, 12006]) Input IDs shape: torch.Size([1, 12006]) Labels shape: torch.Size([1, 12006]) Final batch size: 1, sequence length: 21421 Attention mask shape: torch.Size([1, 1, 21421, 21421]) Position ids shape: torch.Size([1, 21421]) Input IDs shape: torch.Size([1, 21421]) Labels shape: torch.Size([1, 21421]) Final batch size: 1, sequence length: 16141 Attention mask shape: torch.Size([1, 1, 16141, 16141]) Position ids shape: torch.Size([1, 16141]) Input IDs shape: torch.Size([1, 16141]) Labels shape: torch.Size([1, 16141]) Final batch size: 1, sequence length: 17665 Attention mask shape: torch.Size([1, 1, 17665, 17665]) Position ids shape: torch.Size([1, 17665]) Input IDs shape: torch.Size([1, 17665]) Labels shape: torch.Size([1, 17665]) Final batch size: 1, sequence length: 20816 Attention mask shape: torch.Size([1, 1, 20816, 20816]) Position ids shape: torch.Size([1, 20816]) Input IDs shape: torch.Size([1, 20816]) Labels shape: torch.Size([1, 20816]) Final batch size: 1, sequence length: 22683 Attention mask shape: torch.Size([1, 1, 22683, 22683]) Position ids shape: torch.Size([1, 22683]) Input IDs shape: torch.Size([1, 22683]) Labels shape: torch.Size([1, 22683]) Final batch size: 1, sequence length: 16439 Attention mask shape: torch.Size([1, 1, 16439, 16439]) Position ids shape: torch.Size([1, 16439]) Input IDs shape: torch.Size([1, 16439]) Labels shape: torch.Size([1, 16439]) Final batch size: 1, sequence length: 26121 Attention mask shape: torch.Size([1, 1, 26121, 26121]) Position ids shape: torch.Size([1, 26121]) Input IDs shape: torch.Size([1, 26121]) Labels shape: torch.Size([1, 26121]) Final batch size: 1, sequence length: 26142 Attention mask shape: torch.Size([1, 1, 26142, 26142]) Position ids shape: torch.Size([1, 26142]) Input IDs shape: torch.Size([1, 26142]) Labels shape: torch.Size([1, 26142]) Final batch size: 1, sequence length: 24308 Attention mask shape: torch.Size([1, 1, 24308, 24308]) Position ids shape: torch.Size([1, 24308]) Input IDs shape: torch.Size([1, 24308]) Labels shape: torch.Size([1, 24308]) Final batch size: 1, sequence length: 24433 Attention mask shape: torch.Size([1, 1, 24433, 24433]) Position ids shape: torch.Size([1, 24433]) Input IDs shape: torch.Size([1, 24433]) Labels shape: torch.Size([1, 24433]) Final batch size: 1, sequence length: 18991 Attention mask shape: torch.Size([1, 1, 18991, 18991]) Position ids shape: torch.Size([1, 18991]) Input IDs shape: torch.Size([1, 18991]) Labels shape: torch.Size([1, 18991]) Final batch size: 1, sequence length: 24515 Attention mask shape: torch.Size([1, 1, 24515, 24515]) Position ids shape: torch.Size([1, 24515]) Input IDs shape: torch.Size([1, 24515]) Labels shape: torch.Size([1, 24515]) Final batch size: 1, sequence length: 27179 Attention mask shape: torch.Size([1, 1, 27179, 27179]) Position ids shape: torch.Size([1, 27179]) Input IDs shape: torch.Size([1, 27179]) Labels shape: torch.Size([1, 27179]) Final batch size: 1, sequence length: 16564 Attention mask shape: torch.Size([1, 1, 16564, 16564]) Position ids shape: torch.Size([1, 16564]) Input IDs shape: torch.Size([1, 16564]) Labels shape: torch.Size([1, 16564]) Final batch size: 1, sequence length: 18122 Attention mask shape: torch.Size([1, 1, 18122, 18122]) Position ids shape: torch.Size([1, 18122]) Input IDs shape: torch.Size([1, 18122]) Labels shape: torch.Size([1, 18122]) Final batch size: 1, sequence length: 10857 Attention mask shape: torch.Size([1, 1, 10857, 10857]) Position ids shape: torch.Size([1, 10857]) Input IDs shape: torch.Size([1, 10857]) Labels shape: torch.Size([1, 10857]) Final batch size: 1, sequence length: 26449 Attention mask shape: torch.Size([1, 1, 26449, 26449]) Position ids shape: torch.Size([1, 26449]) Input IDs shape: torch.Size([1, 26449]) Labels shape: torch.Size([1, 26449]) Final batch size: 1, sequence length: 28164 Attention mask shape: torch.Size([1, 1, 28164, 28164]) Position ids shape: torch.Size([1, 28164]) Input IDs shape: torch.Size([1, 28164]) Labels shape: torch.Size([1, 28164]) Final batch size: 1, sequence length: 17363 Attention mask shape: torch.Size([1, 1, 17363, 17363]) Position ids shape: torch.Size([1, 17363]) Input IDs shape: torch.Size([1, 17363]) Labels shape: torch.Size([1, 17363]) Final batch size: 1, sequence length: 30181 Attention mask shape: torch.Size([1, 1, 30181, 30181]) Position ids shape: torch.Size([1, 30181]) Input IDs shape: torch.Size([1, 30181]) Labels shape: torch.Size([1, 30181]) Final batch size: 1, sequence length: 31339 Attention mask shape: torch.Size([1, 1, 31339, 31339]) Position ids shape: torch.Size([1, 31339]) Input IDs shape: torch.Size([1, 31339]) Labels shape: torch.Size([1, 31339]) Final batch size: 1, sequence length: 30197 Attention mask shape: torch.Size([1, 1, 30197, 30197]) Position ids shape: torch.Size([1, 30197]) Input IDs shape: torch.Size([1, 30197]) Labels shape: torch.Size([1, 30197]) Final batch size: 1, sequence length: 24802 Attention mask shape: torch.Size([1, 1, 24802, 24802]) Position ids shape: torch.Size([1, 24802]) Input IDs shape: torch.Size([1, 24802]) Labels shape: torch.Size([1, 24802]) Final batch size: 1, sequence length: 21152 Attention mask shape: torch.Size([1, 1, 21152, 21152]) Position ids shape: torch.Size([1, 21152]) Input IDs shape: torch.Size([1, 21152]) Labels shape: torch.Size([1, 21152]) Final batch size: 1, sequence length: 32466 Attention mask shape: torch.Size([1, 1, 32466, 32466]) Position ids shape: torch.Size([1, 32466]) Input IDs shape: torch.Size([1, 32466]) Labels shape: torch.Size([1, 32466]) Final batch size: 1, sequence length: 29628 Attention mask shape: torch.Size([1, 1, 29628, 29628]) Position ids shape: torch.Size([1, 29628]) Input IDs shape: torch.Size([1, 29628]) Labels shape: torch.Size([1, 29628]) Final batch size: 1, sequence length: 31507 Attention mask shape: torch.Size([1, 1, 31507, 31507]) Position ids shape: torch.Size([1, 31507]) Input IDs shape: torch.Size([1, 31507]) Labels shape: torch.Size([1, 31507]) Final batch size: 1, sequence length: 29392 Attention mask shape: torch.Size([1, 1, 29392, 29392]) Position ids shape: torch.Size([1, 29392]) Input IDs shape: torch.Size([1, 29392]) Labels shape: torch.Size([1, 29392]) Final batch size: 1, sequence length: 16837 Attention mask shape: torch.Size([1, 1, 16837, 16837]) Position ids shape: torch.Size([1, 16837]) Input IDs shape: torch.Size([1, 16837]) Labels shape: torch.Size([1, 16837]) Final batch size: 1, sequence length: 24527 Attention mask shape: torch.Size([1, 1, 24527, 24527]) Position ids shape: torch.Size([1, 24527]) Input IDs shape: torch.Size([1, 24527]) Labels shape: torch.Size([1, 24527]) Final batch size: 1, sequence length: 32752 Attention mask shape: torch.Size([1, 1, 32752, 32752]) Position ids shape: torch.Size([1, 32752]) Input IDs shape: torch.Size([1, 32752]) Labels shape: torch.Size([1, 32752]) Final batch size: 1, sequence length: 31525 Attention mask shape: torch.Size([1, 1, 31525, 31525]) Position ids shape: torch.Size([1, 31525]) Input IDs shape: torch.Size([1, 31525]) Labels shape: torch.Size([1, 31525]) Final batch size: 1, sequence length: 22762 Attention mask shape: torch.Size([1, 1, 22762, 22762]) Position ids shape: torch.Size([1, 22762]) Input IDs shape: torch.Size([1, 22762]) Labels shape: torch.Size([1, 22762]) Final batch size: 1, sequence length: 34711 Attention mask shape: torch.Size([1, 1, 34711, 34711]) Position ids shape: torch.Size([1, 34711]) Input IDs shape: torch.Size([1, 34711]) Labels shape: torch.Size([1, 34711]) Final batch size: 1, sequence length: 33894 Attention mask shape: torch.Size([1, 1, 33894, 33894]) Position ids shape: torch.Size([1, 33894]) Input IDs shape: torch.Size([1, 33894]) Labels shape: torch.Size([1, 33894]) Final batch size: 1, sequence length: 11107 Attention mask shape: torch.Size([1, 1, 11107, 11107]) Position ids shape: torch.Size([1, 11107]) Input IDs shape: torch.Size([1, 11107]) Labels shape: torch.Size([1, 11107]) Final batch size: 1, sequence length: 20274 Attention mask shape: torch.Size([1, 1, 20274, 20274]) Position ids shape: torch.Size([1, 20274]) Input IDs shape: torch.Size([1, 20274]) Labels shape: torch.Size([1, 20274]) Final batch size: 1, sequence length: 32891 Attention mask shape: torch.Size([1, 1, 32891, 32891]) Position ids shape: torch.Size([1, 32891]) Input IDs shape: torch.Size([1, 32891]) Labels shape: torch.Size([1, 32891]) Final batch size: 1, sequence length: 35523 Attention mask shape: torch.Size([1, 1, 35523, 35523]) Position ids shape: torch.Size([1, 35523]) Input IDs shape: torch.Size([1, 35523]) Labels shape: torch.Size([1, 35523]) Final batch size: 1, sequence length: 21143 Attention mask shape: torch.Size([1, 1, 21143, 21143]) Position ids shape: torch.Size([1, 21143]) Input IDs shape: torch.Size([1, 21143]) Labels shape: torch.Size([1, 21143]) Final batch size: 1, sequence length: 33087 Attention mask shape: torch.Size([1, 1, 33087, 33087]) Position ids shape: torch.Size([1, 33087]) Input IDs shape: torch.Size([1, 33087]) Labels shape: torch.Size([1, 33087]) Final batch size: 1, sequence length: 33367 Attention mask shape: torch.Size([1, 1, 33367, 33367]) Position ids shape: torch.Size([1, 33367]) Input IDs shape: torch.Size([1, 33367]) Labels shape: torch.Size([1, 33367]) Final batch size: 1, sequence length: 17181 Attention mask shape: torch.Size([1, 1, 17181, 17181]) Position ids shape: torch.Size([1, 17181]) Input IDs shape: torch.Size([1, 17181]) Labels shape: torch.Size([1, 17181]) Final batch size: 1, sequence length: 27061 Attention mask shape: torch.Size([1, 1, 27061, 27061]) Position ids shape: torch.Size([1, 27061]) Input IDs shape: torch.Size([1, 27061]) Labels shape: torch.Size([1, 27061]) Final batch size: 1, sequence length: 34512 Attention mask shape: torch.Size([1, 1, 34512, 34512]) Position ids shape: torch.Size([1, 34512]) Input IDs shape: torch.Size([1, 34512]) Labels shape: torch.Size([1, 34512]) Final batch size: 1, sequence length: 34405 Attention mask shape: torch.Size([1, 1, 34405, 34405]) Position ids shape: torch.Size([1, 34405]) Input IDs shape: torch.Size([1, 34405]) Labels shape: torch.Size([1, 34405]) Final batch size: 1, sequence length: 13942 Attention mask shape: torch.Size([1, 1, 13942, 13942]) Position ids shape: torch.Size([1, 13942]) Input IDs shape: torch.Size([1, 13942]) Labels shape: torch.Size([1, 13942]) Final batch size: 1, sequence length: 37343 Attention mask shape: torch.Size([1, 1, 37343, 37343]) Position ids shape: torch.Size([1, 37343]) Input IDs shape: torch.Size([1, 37343]) Labels shape: torch.Size([1, 37343]) Final batch size: 1, sequence length: 33794 Attention mask shape: torch.Size([1, 1, 33794, 33794]) Position ids shape: torch.Size([1, 33794]) Input IDs shape: torch.Size([1, 33794]) Labels shape: torch.Size([1, 33794]) Final batch size: 1, sequence length: 37555 Attention mask shape: torch.Size([1, 1, 37555, 37555]) Position ids shape: torch.Size([1, 37555]) Input IDs shape: torch.Size([1, 37555]) Labels shape: torch.Size([1, 37555]) Final batch size: 1, sequence length: 18229 Attention mask shape: torch.Size([1, 1, 18229, 18229]) Position ids shape: torch.Size([1, 18229]) Input IDs shape: torch.Size([1, 18229]) Labels shape: torch.Size([1, 18229]) Final batch size: 1, sequence length: 38010 Attention mask shape: torch.Size([1, 1, 38010, 38010]) Position ids shape: torch.Size([1, 38010]) Input IDs shape: torch.Size([1, 38010]) Labels shape: torch.Size([1, 38010]) Final batch size: 1, sequence length: 31875 Attention mask shape: torch.Size([1, 1, 31875, 31875]) Position ids shape: torch.Size([1, 31875]) Input IDs shape: torch.Size([1, 31875]) Labels shape: torch.Size([1, 31875]) Final batch size: 1, sequence length: 31518 Attention mask shape: torch.Size([1, 1, 31518, 31518]) Position ids shape: torch.Size([1, 31518]) Input IDs shape: torch.Size([1, 31518]) Labels shape: torch.Size([1, 31518]) Final batch size: 1, sequence length: 21859 Attention mask shape: torch.Size([1, 1, 21859, 21859]) Position ids shape: torch.Size([1, 21859]) Input IDs shape: torch.Size([1, 21859]) Labels shape: torch.Size([1, 21859]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 31561 Attention mask shape: torch.Size([1, 1, 31561, 31561]) Position ids shape: torch.Size([1, 31561]) Input IDs shape: torch.Size([1, 31561]) Labels shape: torch.Size([1, 31561]) Final batch size: 1, sequence length: 36450 Attention mask shape: torch.Size([1, 1, 36450, 36450]) Position ids shape: torch.Size([1, 36450]) Input IDs shape: torch.Size([1, 36450]) Labels shape: torch.Size([1, 36450]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 13978 Attention mask shape: torch.Size([1, 1, 13978, 13978]) Position ids shape: torch.Size([1, 13978]) Input IDs shape: torch.Size([1, 13978]) Labels shape: torch.Size([1, 13978]) Final batch size: 1, sequence length: 16716 Attention mask shape: torch.Size([1, 1, 16716, 16716]) Position ids shape: torch.Size([1, 16716]) Input IDs shape: torch.Size([1, 16716]) Labels shape: torch.Size([1, 16716]) Final batch size: 1, sequence length: 40910 Attention mask shape: torch.Size([1, 1, 40910, 40910]) Position ids shape: torch.Size([1, 40910]) Input IDs shape: torch.Size([1, 40910]) Labels shape: torch.Size([1, 40910]) Final batch size: 1, sequence length: 16433 Attention mask shape: torch.Size([1, 1, 16433, 16433]) Position ids shape: torch.Size([1, 16433]) Input IDs shape: torch.Size([1, 16433]) Labels shape: torch.Size([1, 16433]) Final batch size: 1, sequence length: 19287 Attention mask shape: torch.Size([1, 1, 19287, 19287]) Position ids shape: torch.Size([1, 19287]) Input IDs shape: torch.Size([1, 19287]) Labels shape: torch.Size([1, 19287]) Final batch size: 1, sequence length: 29042 Attention mask shape: torch.Size([1, 1, 29042, 29042]) Position ids shape: torch.Size([1, 29042]) Input IDs shape: torch.Size([1, 29042]) Labels shape: torch.Size([1, 29042]) Final batch size: 1, sequence length: 36464 Attention mask shape: torch.Size([1, 1, 36464, 36464]) Position ids shape: torch.Size([1, 36464]) Input IDs shape: torch.Size([1, 36464]) Labels shape: torch.Size([1, 36464]) Final batch size: 1, sequence length: 30134 Attention mask shape: torch.Size([1, 1, 30134, 30134]) Position ids shape: torch.Size([1, 30134]) Input IDs shape: torch.Size([1, 30134]) Labels shape: torch.Size([1, 30134]) Final batch size: 1, sequence length: 37349 Attention mask shape: torch.Size([1, 1, 37349, 37349]) Position ids shape: torch.Size([1, 37349]) Input IDs shape: torch.Size([1, 37349]) Labels shape: torch.Size([1, 37349]) Final batch size: 1, sequence length: 26922 Attention mask shape: torch.Size([1, 1, 26922, 26922]) Position ids shape: torch.Size([1, 26922]) Input IDs shape: torch.Size([1, 26922]) Labels shape: torch.Size([1, 26922]) Final batch size: 1, sequence length: 17758 Attention mask shape: torch.Size([1, 1, 17758, 17758]) Position ids shape: torch.Size([1, 17758]) Input IDs shape: torch.Size([1, 17758]) Labels shape: torch.Size([1, 17758]) Final batch size: 1, sequence length: 29216 Attention mask shape: torch.Size([1, 1, 29216, 29216]) Position ids shape: torch.Size([1, 29216]) Input IDs shape: torch.Size([1, 29216]) Labels shape: torch.Size([1, 29216]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 38529 Attention mask shape: torch.Size([1, 1, 38529, 38529]) Position ids shape: torch.Size([1, 38529]) Input IDs shape: torch.Size([1, 38529]) Labels shape: torch.Size([1, 38529]) Final batch size: 1, sequence length: 40752 Attention mask shape: torch.Size([1, 1, 40752, 40752]) Position ids shape: torch.Size([1, 40752]) Input IDs shape: torch.Size([1, 40752]) Labels shape: torch.Size([1, 40752]) Final batch size: 1, sequence length: 39169 Attention mask shape: torch.Size([1, 1, 39169, 39169]) Position ids shape: torch.Size([1, 39169]) Input IDs shape: torch.Size([1, 39169]) Labels shape: torch.Size([1, 39169]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 26893 Attention mask shape: torch.Size([1, 1, 26893, 26893]) Position ids shape: torch.Size([1, 26893]) Input IDs shape: torch.Size([1, 26893]) Labels shape: torch.Size([1, 26893]) Final batch size: 1, sequence length: 17535 Attention mask shape: torch.Size([1, 1, 17535, 17535]) Position ids shape: torch.Size([1, 17535]) Input IDs shape: torch.Size([1, 17535]) Labels shape: torch.Size([1, 17535]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 38891 Attention mask shape: torch.Size([1, 1, 38891, 38891]) Position ids shape: torch.Size([1, 38891]) Input IDs shape: torch.Size([1, 38891]) Labels shape: torch.Size([1, 38891]) Final batch size: 1, sequence length: 31449 Attention mask shape: torch.Size([1, 1, 31449, 31449]) Position ids shape: torch.Size([1, 31449]) Input IDs shape: torch.Size([1, 31449]) Labels shape: torch.Size([1, 31449]) Final batch size: 1, sequence length: 32638 Attention mask shape: torch.Size([1, 1, 32638, 32638]) Position ids shape: torch.Size([1, 32638]) Input IDs shape: torch.Size([1, 32638]) Labels shape: torch.Size([1, 32638]) Final batch size: 1, sequence length: 36599 Attention mask shape: torch.Size([1, 1, 36599, 36599]) Position ids shape: torch.Size([1, 36599]) Input IDs shape: torch.Size([1, 36599]) Labels shape: torch.Size([1, 36599]) Final batch size: 1, sequence length: 39258 Attention mask shape: torch.Size([1, 1, 39258, 39258]) Position ids shape: torch.Size([1, 39258]) Input IDs shape: torch.Size([1, 39258]) Labels shape: torch.Size([1, 39258]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 17890 Attention mask shape: torch.Size([1, 1, 17890, 17890]) Position ids shape: torch.Size([1, 17890]) Input IDs shape: torch.Size([1, 17890]) Labels shape: torch.Size([1, 17890]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 18471 Attention mask shape: torch.Size([1, 1, 18471, 18471]) Position ids shape: torch.Size([1, 18471]) Input IDs shape: torch.Size([1, 18471]) Labels shape: torch.Size([1, 18471]) Final batch size: 1, sequence length: 32465 Attention mask shape: torch.Size([1, 1, 32465, 32465]) Position ids shape: torch.Size([1, 32465]) Input IDs shape: torch.Size([1, 32465]) Labels shape: torch.Size([1, 32465]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 39953 Attention mask shape: torch.Size([1, 1, 39953, 39953]) Position ids shape: torch.Size([1, 39953]) Input IDs shape: torch.Size([1, 39953]) Labels shape: torch.Size([1, 39953]) {'loss': 0.2507, 'grad_norm': 0.17115458997667415, 'learning_rate': 1.2842758726130283e-06, 'num_tokens': -inf, 'epoch': 6.38} Final batch size: 1, sequence length: 6215 Attention mask shape: torch.Size([1, 1, 6215, 6215]) Position ids shape: torch.Size([1, 6215]) Input IDs shape: torch.Size([1, 6215]) Labels shape: torch.Size([1, 6215]) Final batch size: 1, sequence length: 5818 Attention mask shape: torch.Size([1, 1, 5818, 5818]) Position ids shape: torch.Size([1, 5818]) Input IDs shape: torch.Size([1, 5818]) Labels shape: torch.Size([1, 5818]) Final batch size: 1, sequence length: 7977 Attention mask shape: torch.Size([1, 1, 7977, 7977]) Position ids shape: torch.Size([1, 7977]) Input IDs shape: torch.Size([1, 7977]) Labels shape: torch.Size([1, 7977]) Final batch size: 1, sequence length: 8500 Attention mask shape: torch.Size([1, 1, 8500, 8500]) Position ids shape: torch.Size([1, 8500]) Input IDs shape: torch.Size([1, 8500]) Labels shape: torch.Size([1, 8500]) Final batch size: 1, sequence length: 10107 Attention mask shape: torch.Size([1, 1, 10107, 10107]) Position ids shape: torch.Size([1, 10107]) Input IDs shape: torch.Size([1, 10107]) Labels shape: torch.Size([1, 10107]) Final batch size: 1, sequence length: 10080 Attention mask shape: torch.Size([1, 1, 10080, 10080]) Position ids shape: torch.Size([1, 10080]) Input IDs shape: torch.Size([1, 10080]) Labels shape: torch.Size([1, 10080]) Final batch size: 1, sequence length: 12826 Attention mask shape: torch.Size([1, 1, 12826, 12826]) Position ids shape: torch.Size([1, 12826]) Input IDs shape: torch.Size([1, 12826]) Labels shape: torch.Size([1, 12826]) Final batch size: 1, sequence length: 9217 Attention mask shape: torch.Size([1, 1, 9217, 9217]) Position ids shape: torch.Size([1, 9217]) Input IDs shape: torch.Size([1, 9217]) Labels shape: torch.Size([1, 9217]) Final batch size: 1, sequence length: 10505 Attention mask shape: torch.Size([1, 1, 10505, 10505]) Position ids shape: torch.Size([1, 10505]) Input IDs shape: torch.Size([1, 10505]) Labels shape: torch.Size([1, 10505]) Final batch size: 1, sequence length: 12928 Attention mask shape: torch.Size([1, 1, 12928, 12928]) Position ids shape: torch.Size([1, 12928]) Input IDs shape: torch.Size([1, 12928]) Labels shape: torch.Size([1, 12928]) Final batch size: 1, sequence length: 12556 Attention mask shape: torch.Size([1, 1, 12556, 12556]) Position ids shape: torch.Size([1, 12556]) Input IDs shape: torch.Size([1, 12556]) Labels shape: torch.Size([1, 12556]) Final batch size: 1, sequence length: 6095 Attention mask shape: torch.Size([1, 1, 6095, 6095]) Position ids shape: torch.Size([1, 6095]) Input IDs shape: torch.Size([1, 6095]) Labels shape: torch.Size([1, 6095]) Final batch size: 1, sequence length: 9379 Attention mask shape: torch.Size([1, 1, 9379, 9379]) Position ids shape: torch.Size([1, 9379]) Input IDs shape: torch.Size([1, 9379]) Labels shape: torch.Size([1, 9379]) Final batch size: 1, sequence length: 13092 Attention mask shape: torch.Size([1, 1, 13092, 13092]) Position ids shape: torch.Size([1, 13092]) Input IDs shape: torch.Size([1, 13092]) Labels shape: torch.Size([1, 13092]) Final batch size: 1, sequence length: 10687 Attention mask shape: torch.Size([1, 1, 10687, 10687]) Position ids shape: torch.Size([1, 10687]) Input IDs shape: torch.Size([1, 10687]) Labels shape: torch.Size([1, 10687]) Final batch size: 1, sequence length: 13624 Attention mask shape: torch.Size([1, 1, 13624, 13624]) Position ids shape: torch.Size([1, 13624]) Input IDs shape: torch.Size([1, 13624]) Labels shape: torch.Size([1, 13624]) Final batch size: 1, sequence length: 16782 Attention mask shape: torch.Size([1, 1, 16782, 16782]) Position ids shape: torch.Size([1, 16782]) Input IDs shape: torch.Size([1, 16782]) Labels shape: torch.Size([1, 16782]) Final batch size: 1, sequence length: 10469 Attention mask shape: torch.Size([1, 1, 10469, 10469]) Position ids shape: torch.Size([1, 10469]) Input IDs shape: torch.Size([1, 10469]) Labels shape: torch.Size([1, 10469]) Final batch size: 1, sequence length: 12454 Attention mask shape: torch.Size([1, 1, 12454, 12454]) Position ids shape: torch.Size([1, 12454]) Input IDs shape: torch.Size([1, 12454]) Labels shape: torch.Size([1, 12454]) Final batch size: 1, sequence length: 13459 Attention mask shape: torch.Size([1, 1, 13459, 13459]) Position ids shape: torch.Size([1, 13459]) Input IDs shape: torch.Size([1, 13459]) Labels shape: torch.Size([1, 13459]) Final batch size: 1, sequence length: 17980 Attention mask shape: torch.Size([1, 1, 17980, 17980]) Position ids shape: torch.Size([1, 17980]) Input IDs shape: torch.Size([1, 17980]) Labels shape: torch.Size([1, 17980]) Final batch size: 1, sequence length: 18454 Attention mask shape: torch.Size([1, 1, 18454, 18454]) Position ids shape: torch.Size([1, 18454]) Input IDs shape: torch.Size([1, 18454]) Labels shape: torch.Size([1, 18454]) Final batch size: 1, sequence length: 10390 Attention mask shape: torch.Size([1, 1, 10390, 10390]) Position ids shape: torch.Size([1, 10390]) Input IDs shape: torch.Size([1, 10390]) Labels shape: torch.Size([1, 10390]) Final batch size: 1, sequence length: 15185 Attention mask shape: torch.Size([1, 1, 15185, 15185]) Position ids shape: torch.Size([1, 15185]) Input IDs shape: torch.Size([1, 15185]) Labels shape: torch.Size([1, 15185]) Final batch size: 1, sequence length: 19683 Attention mask shape: torch.Size([1, 1, 19683, 19683]) Position ids shape: torch.Size([1, 19683]) Input IDs shape: torch.Size([1, 19683]) Labels shape: torch.Size([1, 19683]) Final batch size: 1, sequence length: 19671 Attention mask shape: torch.Size([1, 1, 19671, 19671]) Position ids shape: torch.Size([1, 19671]) Input IDs shape: torch.Size([1, 19671]) Labels shape: torch.Size([1, 19671]) Final batch size: 1, sequence length: 20191 Attention mask shape: torch.Size([1, 1, 20191, 20191]) Position ids shape: torch.Size([1, 20191]) Input IDs shape: torch.Size([1, 20191]) Labels shape: torch.Size([1, 20191]) Final batch size: 1, sequence length: 10277 Attention mask shape: torch.Size([1, 1, 10277, 10277]) Position ids shape: torch.Size([1, 10277]) Input IDs shape: torch.Size([1, 10277]) Labels shape: torch.Size([1, 10277]) Final batch size: 1, sequence length: 18469 Attention mask shape: torch.Size([1, 1, 18469, 18469]) Position ids shape: torch.Size([1, 18469]) Input IDs shape: torch.Size([1, 18469]) Labels shape: torch.Size([1, 18469]) Final batch size: 1, sequence length: 20243 Attention mask shape: torch.Size([1, 1, 20243, 20243]) Position ids shape: torch.Size([1, 20243]) Input IDs shape: torch.Size([1, 20243]) Labels shape: torch.Size([1, 20243]) Final batch size: 1, sequence length: 16585 Attention mask shape: torch.Size([1, 1, 16585, 16585]) Position ids shape: torch.Size([1, 16585]) Input IDs shape: torch.Size([1, 16585]) Labels shape: torch.Size([1, 16585]) Final batch size: 1, sequence length: 20432 Attention mask shape: torch.Size([1, 1, 20432, 20432]) Position ids shape: torch.Size([1, 20432]) Input IDs shape: torch.Size([1, 20432]) Labels shape: torch.Size([1, 20432]) Final batch size: 1, sequence length: 22133 Attention mask shape: torch.Size([1, 1, 22133, 22133]) Position ids shape: torch.Size([1, 22133]) Input IDs shape: torch.Size([1, 22133]) Labels shape: torch.Size([1, 22133]) Final batch size: 1, sequence length: 17417 Attention mask shape: torch.Size([1, 1, 17417, 17417]) Position ids shape: torch.Size([1, 17417]) Input IDs shape: torch.Size([1, 17417]) Labels shape: torch.Size([1, 17417]) Final batch size: 1, sequence length: 21664 Attention mask shape: torch.Size([1, 1, 21664, 21664]) Position ids shape: torch.Size([1, 21664]) Input IDs shape: torch.Size([1, 21664]) Labels shape: torch.Size([1, 21664]) Final batch size: 1, sequence length: 14892 Attention mask shape: torch.Size([1, 1, 14892, 14892]) Position ids shape: torch.Size([1, 14892]) Input IDs shape: torch.Size([1, 14892]) Labels shape: torch.Size([1, 14892]) Final batch size: 1, sequence length: 15263 Attention mask shape: torch.Size([1, 1, 15263, 15263]) Position ids shape: torch.Size([1, 15263]) Input IDs shape: torch.Size([1, 15263]) Labels shape: torch.Size([1, 15263]) Final batch size: 1, sequence length: 15339 Attention mask shape: torch.Size([1, 1, 15339, 15339]) Position ids shape: torch.Size([1, 15339]) Input IDs shape: torch.Size([1, 15339]) Labels shape: torch.Size([1, 15339]) Final batch size: 1, sequence length: 10182 Attention mask shape: torch.Size([1, 1, 10182, 10182]) Position ids shape: torch.Size([1, 10182]) Input IDs shape: torch.Size([1, 10182]) Labels shape: torch.Size([1, 10182]) Final batch size: 1, sequence length: 12012 Attention mask shape: torch.Size([1, 1, 12012, 12012]) Position ids shape: torch.Size([1, 12012]) Input IDs shape: torch.Size([1, 12012]) Labels shape: torch.Size([1, 12012]) Final batch size: 1, sequence length: 24255 Attention mask shape: torch.Size([1, 1, 24255, 24255]) Position ids shape: torch.Size([1, 24255]) Input IDs shape: torch.Size([1, 24255]) Labels shape: torch.Size([1, 24255]) Final batch size: 1, sequence length: 19187 Attention mask shape: torch.Size([1, 1, 19187, 19187]) Position ids shape: torch.Size([1, 19187]) Input IDs shape: torch.Size([1, 19187]) Labels shape: torch.Size([1, 19187]) Final batch size: 1, sequence length: 24769 Attention mask shape: torch.Size([1, 1, 24769, 24769]) Position ids shape: torch.Size([1, 24769]) Input IDs shape: torch.Size([1, 24769]) Labels shape: torch.Size([1, 24769]) Final batch size: 1, sequence length: 23238 Attention mask shape: torch.Size([1, 1, 23238, 23238]) Position ids shape: torch.Size([1, 23238]) Input IDs shape: torch.Size([1, 23238]) Labels shape: torch.Size([1, 23238]) Final batch size: 1, sequence length: 21051 Attention mask shape: torch.Size([1, 1, 21051, 21051]) Position ids shape: torch.Size([1, 21051]) Input IDs shape: torch.Size([1, 21051]) Labels shape: torch.Size([1, 21051]) Final batch size: 1, sequence length: 24002 Attention mask shape: torch.Size([1, 1, 24002, 24002]) Position ids shape: torch.Size([1, 24002]) Input IDs shape: torch.Size([1, 24002]) Labels shape: torch.Size([1, 24002]) Final batch size: 1, sequence length: 24428 Attention mask shape: torch.Size([1, 1, 24428, 24428]) Position ids shape: torch.Size([1, 24428]) Input IDs shape: torch.Size([1, 24428]) Labels shape: torch.Size([1, 24428]) Final batch size: 1, sequence length: 24858 Attention mask shape: torch.Size([1, 1, 24858, 24858]) Position ids shape: torch.Size([1, 24858]) Input IDs shape: torch.Size([1, 24858]) Labels shape: torch.Size([1, 24858]) Final batch size: 1, sequence length: 20562 Attention mask shape: torch.Size([1, 1, 20562, 20562]) Position ids shape: torch.Size([1, 20562]) Input IDs shape: torch.Size([1, 20562]) Labels shape: torch.Size([1, 20562]) Final batch size: 1, sequence length: 16541 Attention mask shape: torch.Size([1, 1, 16541, 16541]) Position ids shape: torch.Size([1, 16541]) Input IDs shape: torch.Size([1, 16541]) Labels shape: torch.Size([1, 16541]) Final batch size: 1, sequence length: 14556 Attention mask shape: torch.Size([1, 1, 14556, 14556]) Position ids shape: torch.Size([1, 14556]) Input IDs shape: torch.Size([1, 14556]) Labels shape: torch.Size([1, 14556]) Final batch size: 1, sequence length: 22857 Attention mask shape: torch.Size([1, 1, 22857, 22857]) Position ids shape: torch.Size([1, 22857]) Input IDs shape: torch.Size([1, 22857]) Labels shape: torch.Size([1, 22857]) Final batch size: 1, sequence length: 25656 Attention mask shape: torch.Size([1, 1, 25656, 25656]) Position ids shape: torch.Size([1, 25656]) Input IDs shape: torch.Size([1, 25656]) Labels shape: torch.Size([1, 25656]) Final batch size: 1, sequence length: 26144 Attention mask shape: torch.Size([1, 1, 26144, 26144]) Position ids shape: torch.Size([1, 26144]) Input IDs shape: torch.Size([1, 26144]) Labels shape: torch.Size([1, 26144]) Final batch size: 1, sequence length: 26312 Attention mask shape: torch.Size([1, 1, 26312, 26312]) Position ids shape: torch.Size([1, 26312]) Input IDs shape: torch.Size([1, 26312]) Labels shape: torch.Size([1, 26312]) Final batch size: 1, sequence length: 26033 Attention mask shape: torch.Size([1, 1, 26033, 26033]) Position ids shape: torch.Size([1, 26033]) Input IDs shape: torch.Size([1, 26033]) Labels shape: torch.Size([1, 26033]) Final batch size: 1, sequence length: 23602 Attention mask shape: torch.Size([1, 1, 23602, 23602]) Position ids shape: torch.Size([1, 23602]) Input IDs shape: torch.Size([1, 23602]) Labels shape: torch.Size([1, 23602]) Final batch size: 1, sequence length: 25886 Attention mask shape: torch.Size([1, 1, 25886, 25886]) Position ids shape: torch.Size([1, 25886]) Input IDs shape: torch.Size([1, 25886]) Labels shape: torch.Size([1, 25886]) Final batch size: 1, sequence length: 25758 Attention mask shape: torch.Size([1, 1, 25758, 25758]) Position ids shape: torch.Size([1, 25758]) Input IDs shape: torch.Size([1, 25758]) Labels shape: torch.Size([1, 25758]) Final batch size: 1, sequence length: 29948 Attention mask shape: torch.Size([1, 1, 29948, 29948]) Position ids shape: torch.Size([1, 29948]) Input IDs shape: torch.Size([1, 29948]) Labels shape: torch.Size([1, 29948]) Final batch size: 1, sequence length: 25707 Attention mask shape: torch.Size([1, 1, 25707, 25707]) Position ids shape: torch.Size([1, 25707]) Input IDs shape: torch.Size([1, 25707]) Labels shape: torch.Size([1, 25707]) Final batch size: 1, sequence length: 29749 Attention mask shape: torch.Size([1, 1, 29749, 29749]) Position ids shape: torch.Size([1, 29749]) Input IDs shape: torch.Size([1, 29749]) Labels shape: torch.Size([1, 29749]) Final batch size: 1, sequence length: 10318 Attention mask shape: torch.Size([1, 1, 10318, 10318]) Position ids shape: torch.Size([1, 10318]) Input IDs shape: torch.Size([1, 10318]) Labels shape: torch.Size([1, 10318]) Final batch size: 1, sequence length: 30079 Attention mask shape: torch.Size([1, 1, 30079, 30079]) Position ids shape: torch.Size([1, 30079]) Input IDs shape: torch.Size([1, 30079]) Labels shape: torch.Size([1, 30079]) Final batch size: 1, sequence length: 23698 Attention mask shape: torch.Size([1, 1, 23698, 23698]) Position ids shape: torch.Size([1, 23698]) Input IDs shape: torch.Size([1, 23698]) Labels shape: torch.Size([1, 23698]) Final batch size: 1, sequence length: 25719 Attention mask shape: torch.Size([1, 1, 25719, 25719]) Position ids shape: torch.Size([1, 25719]) Input IDs shape: torch.Size([1, 25719]) Labels shape: torch.Size([1, 25719]) Final batch size: 1, sequence length: 29825 Attention mask shape: torch.Size([1, 1, 29825, 29825]) Position ids shape: torch.Size([1, 29825]) Input IDs shape: torch.Size([1, 29825]) Labels shape: torch.Size([1, 29825]) Final batch size: 1, sequence length: 15294 Attention mask shape: torch.Size([1, 1, 15294, 15294]) Position ids shape: torch.Size([1, 15294]) Input IDs shape: torch.Size([1, 15294]) Labels shape: torch.Size([1, 15294]) Final batch size: 1, sequence length: 33611 Attention mask shape: torch.Size([1, 1, 33611, 33611]) Position ids shape: torch.Size([1, 33611]) Input IDs shape: torch.Size([1, 33611]) Labels shape: torch.Size([1, 33611]) Final batch size: 1, sequence length: 21374 Attention mask shape: torch.Size([1, 1, 21374, 21374]) Position ids shape: torch.Size([1, 21374]) Input IDs shape: torch.Size([1, 21374]) Labels shape: torch.Size([1, 21374]) Final batch size: 1, sequence length: 32541 Attention mask shape: torch.Size([1, 1, 32541, 32541]) Position ids shape: torch.Size([1, 32541]) Input IDs shape: torch.Size([1, 32541]) Labels shape: torch.Size([1, 32541]) Final batch size: 1, sequence length: 35760 Attention mask shape: torch.Size([1, 1, 35760, 35760]) Position ids shape: torch.Size([1, 35760]) Input IDs shape: torch.Size([1, 35760]) Labels shape: torch.Size([1, 35760]) Final batch size: 1, sequence length: 25492 Final batch size: 1, sequence length: 22139 Attention mask shape: torch.Size([1, 1, 25492, 25492]) Position ids shape: torch.Size([1, 25492]) Input IDs shape: torch.Size([1, 25492]) Labels shape: torch.Size([1, 25492]) Attention mask shape: torch.Size([1, 1, 22139, 22139]) Position ids shape: torch.Size([1, 22139]) Input IDs shape: torch.Size([1, 22139]) Labels shape: torch.Size([1, 22139]) Final batch size: 1, sequence length: 32071 Attention mask shape: torch.Size([1, 1, 32071, 32071]) Position ids shape: torch.Size([1, 32071]) Input IDs shape: torch.Size([1, 32071]) Labels shape: torch.Size([1, 32071]) Final batch size: 1, sequence length: 23399 Attention mask shape: torch.Size([1, 1, 23399, 23399]) Position ids shape: torch.Size([1, 23399]) Input IDs shape: torch.Size([1, 23399]) Labels shape: torch.Size([1, 23399]) Final batch size: 1, sequence length: 37241 Attention mask shape: torch.Size([1, 1, 37241, 37241]) Position ids shape: torch.Size([1, 37241]) Input IDs shape: torch.Size([1, 37241]) Labels shape: torch.Size([1, 37241]) Final batch size: 1, sequence length: 9897 Attention mask shape: torch.Size([1, 1, 9897, 9897]) Position ids shape: torch.Size([1, 9897]) Input IDs shape: torch.Size([1, 9897]) Labels shape: torch.Size([1, 9897]) Final batch size: 1, sequence length: 34186 Attention mask shape: torch.Size([1, 1, 34186, 34186]) Position ids shape: torch.Size([1, 34186]) Input IDs shape: torch.Size([1, 34186]) Labels shape: torch.Size([1, 34186]) Final batch size: 1, sequence length: 36456 Attention mask shape: torch.Size([1, 1, 36456, 36456]) Position ids shape: torch.Size([1, 36456]) Input IDs shape: torch.Size([1, 36456]) Labels shape: torch.Size([1, 36456]) Final batch size: 1, sequence length: 19238 Attention mask shape: torch.Size([1, 1, 19238, 19238]) Position ids shape: torch.Size([1, 19238]) Input IDs shape: torch.Size([1, 19238]) Labels shape: torch.Size([1, 19238]) Final batch size: 1, sequence length: 7681 Attention mask shape: torch.Size([1, 1, 7681, 7681]) Position ids shape: torch.Size([1, 7681]) Input IDs shape: torch.Size([1, 7681]) Labels shape: torch.Size([1, 7681]) Final batch size: 1, sequence length: 15604 Attention mask shape: torch.Size([1, 1, 15604, 15604]) Position ids shape: torch.Size([1, 15604]) Input IDs shape: torch.Size([1, 15604]) Labels shape: torch.Size([1, 15604]) Final batch size: 1, sequence length: 37728 Attention mask shape: torch.Size([1, 1, 37728, 37728]) Position ids shape: torch.Size([1, 37728]) Input IDs shape: torch.Size([1, 37728]) Labels shape: torch.Size([1, 37728]) Final batch size: 1, sequence length: 34701 Attention mask shape: torch.Size([1, 1, 34701, 34701]) Position ids shape: torch.Size([1, 34701]) Input IDs shape: torch.Size([1, 34701]) Labels shape: torch.Size([1, 34701]) Final batch size: 1, sequence length: 40593 Attention mask shape: torch.Size([1, 1, 40593, 40593]) Position ids shape: torch.Size([1, 40593]) Input IDs shape: torch.Size([1, 40593]) Labels shape: torch.Size([1, 40593]) Final batch size: 1, sequence length: 36778 Attention mask shape: torch.Size([1, 1, 36778, 36778]) Position ids shape: torch.Size([1, 36778]) Input IDs shape: torch.Size([1, 36778]) Labels shape: torch.Size([1, 36778]) Final batch size: 1, sequence length: 21557 Attention mask shape: torch.Size([1, 1, 21557, 21557]) Position ids shape: torch.Size([1, 21557]) Input IDs shape: torch.Size([1, 21557]) Labels shape: torch.Size([1, 21557]) Final batch size: 1, sequence length: 23343 Attention mask shape: torch.Size([1, 1, 23343, 23343]) Position ids shape: torch.Size([1, 23343]) Input IDs shape: torch.Size([1, 23343]) Labels shape: torch.Size([1, 23343]) Final batch size: 1, sequence length: 21955 Attention mask shape: torch.Size([1, 1, 21955, 21955]) Position ids shape: torch.Size([1, 21955]) Input IDs shape: torch.Size([1, 21955]) Labels shape: torch.Size([1, 21955]) Final batch size: 1, sequence length: 33459 Attention mask shape: torch.Size([1, 1, 33459, 33459]) Position ids shape: torch.Size([1, 33459]) Input IDs shape: torch.Size([1, 33459]) Labels shape: torch.Size([1, 33459]) Final batch size: 1, sequence length: 40147 Attention mask shape: torch.Size([1, 1, 40147, 40147]) Position ids shape: torch.Size([1, 40147]) Input IDs shape: torch.Size([1, 40147]) Labels shape: torch.Size([1, 40147]) Final batch size: 1, sequence length: 31650 Attention mask shape: torch.Size([1, 1, 31650, 31650]) Position ids shape: torch.Size([1, 31650]) Input IDs shape: torch.Size([1, 31650]) Labels shape: torch.Size([1, 31650]) Final batch size: 1, sequence length: 39760 Attention mask shape: torch.Size([1, 1, 39760, 39760]) Position ids shape: torch.Size([1, 39760]) Input IDs shape: torch.Size([1, 39760]) Labels shape: torch.Size([1, 39760]) Final batch size: 1, sequence length: 31323 Attention mask shape: torch.Size([1, 1, 31323, 31323]) Position ids shape: torch.Size([1, 31323]) Input IDs shape: torch.Size([1, 31323]) Labels shape: torch.Size([1, 31323]) Final batch size: 1, sequence length: 9445 Attention mask shape: torch.Size([1, 1, 9445, 9445]) Position ids shape: torch.Size([1, 9445]) Input IDs shape: torch.Size([1, 9445]) Labels shape: torch.Size([1, 9445]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 36579 Attention mask shape: torch.Size([1, 1, 36579, 36579]) Position ids shape: torch.Size([1, 36579]) Input IDs shape: torch.Size([1, 36579]) Labels shape: torch.Size([1, 36579]) Final batch size: 1, sequence length: 27283 Attention mask shape: torch.Size([1, 1, 27283, 27283]) Position ids shape: torch.Size([1, 27283]) Input IDs shape: torch.Size([1, 27283]) Labels shape: torch.Size([1, 27283]) Final batch size: 1, sequence length: 22491 Attention mask shape: torch.Size([1, 1, 22491, 22491]) Position ids shape: torch.Size([1, 22491]) Input IDs shape: torch.Size([1, 22491]) Labels shape: torch.Size([1, 22491]) Final batch size: 1, sequence length: 22434 Attention mask shape: torch.Size([1, 1, 22434, 22434]) Position ids shape: torch.Size([1, 22434]) Input IDs shape: torch.Size([1, 22434]) Labels shape: torch.Size([1, 22434]) Final batch size: 1, sequence length: 40507 Attention mask shape: torch.Size([1, 1, 40507, 40507]) Position ids shape: torch.Size([1, 40507]) Input IDs shape: torch.Size([1, 40507]) Labels shape: torch.Size([1, 40507]) Final batch size: 1, sequence length: 19867 Attention mask shape: torch.Size([1, 1, 19867, 19867]) Position ids shape: torch.Size([1, 19867]) Input IDs shape: torch.Size([1, 19867]) Labels shape: torch.Size([1, 19867]) Final batch size: 1, sequence length: 17646 Attention mask shape: torch.Size([1, 1, 17646, 17646]) Position ids shape: torch.Size([1, 17646]) Input IDs shape: torch.Size([1, 17646]) Labels shape: torch.Size([1, 17646]) Final batch size: 1, sequence length: 37183 Attention mask shape: torch.Size([1, 1, 37183, 37183]) Position ids shape: torch.Size([1, 37183]) Input IDs shape: torch.Size([1, 37183]) Labels shape: torch.Size([1, 37183]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 26664 Attention mask shape: torch.Size([1, 1, 26664, 26664]) Position ids shape: torch.Size([1, 26664]) Input IDs shape: torch.Size([1, 26664]) Labels shape: torch.Size([1, 26664]) Final batch size: 1, sequence length: 21489 Attention mask shape: torch.Size([1, 1, 21489, 21489]) Position ids shape: torch.Size([1, 21489]) Input IDs shape: torch.Size([1, 21489]) Labels shape: torch.Size([1, 21489]) Final batch size: 1, sequence length: 29082 Attention mask shape: torch.Size([1, 1, 29082, 29082]) Position ids shape: torch.Size([1, 29082]) Input IDs shape: torch.Size([1, 29082]) Labels shape: torch.Size([1, 29082]) Final batch size: 1, sequence length: 37775 Attention mask shape: torch.Size([1, 1, 37775, 37775]) Position ids shape: torch.Size([1, 37775]) Input IDs shape: torch.Size([1, 37775]) Labels shape: torch.Size([1, 37775]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 23014 Attention mask shape: torch.Size([1, 1, 23014, 23014]) Position ids shape: torch.Size([1, 23014]) Input IDs shape: torch.Size([1, 23014]) Labels shape: torch.Size([1, 23014]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 28661 Attention mask shape: torch.Size([1, 1, 28661, 28661]) Position ids shape: torch.Size([1, 28661]) Input IDs shape: torch.Size([1, 28661]) Labels shape: torch.Size([1, 28661]) Final batch size: 1, sequence length: 16273 Attention mask shape: torch.Size([1, 1, 16273, 16273]) Position ids shape: torch.Size([1, 16273]) Input IDs shape: torch.Size([1, 16273]) Labels shape: torch.Size([1, 16273]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 38686 Attention mask shape: torch.Size([1, 1, 38686, 38686]) Position ids shape: torch.Size([1, 38686]) Input IDs shape: torch.Size([1, 38686]) Labels shape: torch.Size([1, 38686]) Final batch size: 1, sequence length: 37091 Attention mask shape: torch.Size([1, 1, 37091, 37091]) Position ids shape: torch.Size([1, 37091]) Input IDs shape: torch.Size([1, 37091]) Labels shape: torch.Size([1, 37091]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 33263 Attention mask shape: torch.Size([1, 1, 33263, 33263]) Position ids shape: torch.Size([1, 33263]) Input IDs shape: torch.Size([1, 33263]) Labels shape: torch.Size([1, 33263]) Final batch size: 1, sequence length: 30364 Attention mask shape: torch.Size([1, 1, 30364, 30364]) Position ids shape: torch.Size([1, 30364]) Input IDs shape: torch.Size([1, 30364]) Labels shape: torch.Size([1, 30364]) Final batch size: 1, sequence length: 25914 Attention mask shape: torch.Size([1, 1, 25914, 25914]) Position ids shape: torch.Size([1, 25914]) Input IDs shape: torch.Size([1, 25914]) Labels shape: torch.Size([1, 25914]) Final batch size: 1, sequence length: 36297 Attention mask shape: torch.Size([1, 1, 36297, 36297]) Position ids shape: torch.Size([1, 36297]) Input IDs shape: torch.Size([1, 36297]) Labels shape: torch.Size([1, 36297]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) {'loss': 0.2536, 'grad_norm': 0.16570496292981496, 'learning_rate': 1.1142701927151456e-06, 'num_tokens': -inf, 'epoch': 6.5} Final batch size: 1, sequence length: 6519 Attention mask shape: torch.Size([1, 1, 6519, 6519]) Position ids shape: torch.Size([1, 6519]) Input IDs shape: torch.Size([1, 6519]) Labels shape: torch.Size([1, 6519]) Final batch size: 1, sequence length: 5525 Attention mask shape: torch.Size([1, 1, 5525, 5525]) Position ids shape: torch.Size([1, 5525]) Input IDs shape: torch.Size([1, 5525]) Labels shape: torch.Size([1, 5525]) Final batch size: 1, sequence length: 9181 Attention mask shape: torch.Size([1, 1, 9181, 9181]) Position ids shape: torch.Size([1, 9181]) Input IDs shape: torch.Size([1, 9181]) Labels shape: torch.Size([1, 9181]) Final batch size: 1, sequence length: 12281 Attention mask shape: torch.Size([1, 1, 12281, 12281]) Position ids shape: torch.Size([1, 12281]) Input IDs shape: torch.Size([1, 12281]) Labels shape: torch.Size([1, 12281]) Final batch size: 1, sequence length: 10273 Attention mask shape: torch.Size([1, 1, 10273, 10273]) Position ids shape: torch.Size([1, 10273]) Input IDs shape: torch.Size([1, 10273]) Labels shape: torch.Size([1, 10273]) Final batch size: 1, sequence length: 12927 Attention mask shape: torch.Size([1, 1, 12927, 12927]) Position ids shape: torch.Size([1, 12927]) Input IDs shape: torch.Size([1, 12927]) Labels shape: torch.Size([1, 12927]) Final batch size: 1, sequence length: 13385 Attention mask shape: torch.Size([1, 1, 13385, 13385]) Position ids shape: torch.Size([1, 13385]) Input IDs shape: torch.Size([1, 13385]) Labels shape: torch.Size([1, 13385]) Final batch size: 1, sequence length: 10905 Attention mask shape: torch.Size([1, 1, 10905, 10905]) Position ids shape: torch.Size([1, 10905]) Input IDs shape: torch.Size([1, 10905]) Labels shape: torch.Size([1, 10905]) Final batch size: 1, sequence length: 10408 Attention mask shape: torch.Size([1, 1, 10408, 10408]) Position ids shape: torch.Size([1, 10408]) Input IDs shape: torch.Size([1, 10408]) Labels shape: torch.Size([1, 10408]) Final batch size: 1, sequence length: 13363 Attention mask shape: torch.Size([1, 1, 13363, 13363]) Position ids shape: torch.Size([1, 13363]) Input IDs shape: torch.Size([1, 13363]) Labels shape: torch.Size([1, 13363]) Final batch size: 1, sequence length: 15257 Attention mask shape: torch.Size([1, 1, 15257, 15257]) Position ids shape: torch.Size([1, 15257]) Input IDs shape: torch.Size([1, 15257]) Labels shape: torch.Size([1, 15257]) Final batch size: 1, sequence length: 13804 Attention mask shape: torch.Size([1, 1, 13804, 13804]) Position ids shape: torch.Size([1, 13804]) Input IDs shape: torch.Size([1, 13804]) Labels shape: torch.Size([1, 13804]) Final batch size: 1, sequence length: 17003 Attention mask shape: torch.Size([1, 1, 17003, 17003]) Position ids shape: torch.Size([1, 17003]) Input IDs shape: torch.Size([1, 17003]) Labels shape: torch.Size([1, 17003]) Final batch size: 1, sequence length: 15518 Attention mask shape: torch.Size([1, 1, 15518, 15518]) Position ids shape: torch.Size([1, 15518]) Input IDs shape: torch.Size([1, 15518]) Labels shape: torch.Size([1, 15518]) Final batch size: 1, sequence length: 19768 Attention mask shape: torch.Size([1, 1, 19768, 19768]) Position ids shape: torch.Size([1, 19768]) Input IDs shape: torch.Size([1, 19768]) Labels shape: torch.Size([1, 19768]) Final batch size: 1, sequence length: 15244 Attention mask shape: torch.Size([1, 1, 15244, 15244]) Position ids shape: torch.Size([1, 15244]) Input IDs shape: torch.Size([1, 15244]) Labels shape: torch.Size([1, 15244]) Final batch size: 1, sequence length: 20089 Attention mask shape: torch.Size([1, 1, 20089, 20089]) Position ids shape: torch.Size([1, 20089]) Input IDs shape: torch.Size([1, 20089]) Labels shape: torch.Size([1, 20089]) Final batch size: 1, sequence length: 20979 Attention mask shape: torch.Size([1, 1, 20979, 20979]) Position ids shape: torch.Size([1, 20979]) Input IDs shape: torch.Size([1, 20979]) Labels shape: torch.Size([1, 20979]) Final batch size: 1, sequence length: 19286 Attention mask shape: torch.Size([1, 1, 19286, 19286]) Position ids shape: torch.Size([1, 19286]) Input IDs shape: torch.Size([1, 19286]) Labels shape: torch.Size([1, 19286]) Final batch size: 1, sequence length: 18645 Attention mask shape: torch.Size([1, 1, 18645, 18645]) Position ids shape: torch.Size([1, 18645]) Input IDs shape: torch.Size([1, 18645]) Labels shape: torch.Size([1, 18645]) Final batch size: 1, sequence length: 8646 Attention mask shape: torch.Size([1, 1, 8646, 8646]) Position ids shape: torch.Size([1, 8646]) Input IDs shape: torch.Size([1, 8646]) Labels shape: torch.Size([1, 8646]) Final batch size: 1, sequence length: 20106 Attention mask shape: torch.Size([1, 1, 20106, 20106]) Position ids shape: torch.Size([1, 20106]) Input IDs shape: torch.Size([1, 20106]) Labels shape: torch.Size([1, 20106]) Final batch size: 1, sequence length: 16520 Attention mask shape: torch.Size([1, 1, 16520, 16520]) Position ids shape: torch.Size([1, 16520]) Input IDs shape: torch.Size([1, 16520]) Labels shape: torch.Size([1, 16520]) Final batch size: 1, sequence length: 5405 Attention mask shape: torch.Size([1, 1, 5405, 5405]) Position ids shape: torch.Size([1, 5405]) Input IDs shape: torch.Size([1, 5405]) Labels shape: torch.Size([1, 5405]) Final batch size: 1, sequence length: 23886 Attention mask shape: torch.Size([1, 1, 23886, 23886]) Position ids shape: torch.Size([1, 23886]) Input IDs shape: torch.Size([1, 23886]) Labels shape: torch.Size([1, 23886]) Final batch size: 1, sequence length: 22777 Attention mask shape: torch.Size([1, 1, 22777, 22777]) Position ids shape: torch.Size([1, 22777]) Input IDs shape: torch.Size([1, 22777]) Labels shape: torch.Size([1, 22777]) Final batch size: 1, sequence length: 14429 Attention mask shape: torch.Size([1, 1, 14429, 14429]) Position ids shape: torch.Size([1, 14429]) Input IDs shape: torch.Size([1, 14429]) Labels shape: torch.Size([1, 14429]) Final batch size: 1, sequence length: 18819 Attention mask shape: torch.Size([1, 1, 18819, 18819]) Position ids shape: torch.Size([1, 18819]) Input IDs shape: torch.Size([1, 18819]) Labels shape: torch.Size([1, 18819]) Final batch size: 1, sequence length: 19513 Attention mask shape: torch.Size([1, 1, 19513, 19513]) Position ids shape: torch.Size([1, 19513]) Input IDs shape: torch.Size([1, 19513]) Labels shape: torch.Size([1, 19513]) Final batch size: 1, sequence length: 23995 Attention mask shape: torch.Size([1, 1, 23995, 23995]) Position ids shape: torch.Size([1, 23995]) Input IDs shape: torch.Size([1, 23995]) Labels shape: torch.Size([1, 23995]) Final batch size: 1, sequence length: 25001 Attention mask shape: torch.Size([1, 1, 25001, 25001]) Position ids shape: torch.Size([1, 25001]) Input IDs shape: torch.Size([1, 25001]) Labels shape: torch.Size([1, 25001]) Final batch size: 1, sequence length: 22311 Attention mask shape: torch.Size([1, 1, 22311, 22311]) Position ids shape: torch.Size([1, 22311]) Input IDs shape: torch.Size([1, 22311]) Labels shape: torch.Size([1, 22311]) Final batch size: 1, sequence length: 23341 Attention mask shape: torch.Size([1, 1, 23341, 23341]) Position ids shape: torch.Size([1, 23341]) Input IDs shape: torch.Size([1, 23341]) Labels shape: torch.Size([1, 23341]) Final batch size: 1, sequence length: 22160 Attention mask shape: torch.Size([1, 1, 22160, 22160]) Position ids shape: torch.Size([1, 22160]) Input IDs shape: torch.Size([1, 22160]) Labels shape: torch.Size([1, 22160]) Final batch size: 1, sequence length: 23942 Attention mask shape: torch.Size([1, 1, 23942, 23942]) Position ids shape: torch.Size([1, 23942]) Input IDs shape: torch.Size([1, 23942]) Labels shape: torch.Size([1, 23942]) Final batch size: 1, sequence length: 25021 Attention mask shape: torch.Size([1, 1, 25021, 25021]) Position ids shape: torch.Size([1, 25021]) Input IDs shape: torch.Size([1, 25021]) Labels shape: torch.Size([1, 25021]) Final batch size: 1, sequence length: 22915 Attention mask shape: torch.Size([1, 1, 22915, 22915]) Position ids shape: torch.Size([1, 22915]) Input IDs shape: torch.Size([1, 22915]) Labels shape: torch.Size([1, 22915]) Final batch size: 1, sequence length: 23724 Attention mask shape: torch.Size([1, 1, 23724, 23724]) Position ids shape: torch.Size([1, 23724]) Input IDs shape: torch.Size([1, 23724]) Labels shape: torch.Size([1, 23724]) Final batch size: 1, sequence length: 24293 Attention mask shape: torch.Size([1, 1, 24293, 24293]) Position ids shape: torch.Size([1, 24293]) Input IDs shape: torch.Size([1, 24293]) Labels shape: torch.Size([1, 24293]) Final batch size: 1, sequence length: 18988 Attention mask shape: torch.Size([1, 1, 18988, 18988]) Position ids shape: torch.Size([1, 18988]) Input IDs shape: torch.Size([1, 18988]) Labels shape: torch.Size([1, 18988]) Final batch size: 1, sequence length: 5288 Attention mask shape: torch.Size([1, 1, 5288, 5288]) Position ids shape: torch.Size([1, 5288]) Input IDs shape: torch.Size([1, 5288]) Labels shape: torch.Size([1, 5288]) Final batch size: 1, sequence length: 24287 Attention mask shape: torch.Size([1, 1, 24287, 24287]) Position ids shape: torch.Size([1, 24287]) Input IDs shape: torch.Size([1, 24287]) Labels shape: torch.Size([1, 24287]) Final batch size: 1, sequence length: 22979 Attention mask shape: torch.Size([1, 1, 22979, 22979]) Position ids shape: torch.Size([1, 22979]) Input IDs shape: torch.Size([1, 22979]) Labels shape: torch.Size([1, 22979]) Final batch size: 1, sequence length: 11266 Attention mask shape: torch.Size([1, 1, 11266, 11266]) Position ids shape: torch.Size([1, 11266]) Input IDs shape: torch.Size([1, 11266]) Labels shape: torch.Size([1, 11266]) Final batch size: 1, sequence length: 26886 Attention mask shape: torch.Size([1, 1, 26886, 26886]) Position ids shape: torch.Size([1, 26886]) Input IDs shape: torch.Size([1, 26886]) Labels shape: torch.Size([1, 26886]) Final batch size: 1, sequence length: 28263 Attention mask shape: torch.Size([1, 1, 28263, 28263]) Position ids shape: torch.Size([1, 28263]) Input IDs shape: torch.Size([1, 28263]) Labels shape: torch.Size([1, 28263]) Final batch size: 1, sequence length: 26461 Attention mask shape: torch.Size([1, 1, 26461, 26461]) Position ids shape: torch.Size([1, 26461]) Input IDs shape: torch.Size([1, 26461]) Labels shape: torch.Size([1, 26461]) Final batch size: 1, sequence length: 29009 Attention mask shape: torch.Size([1, 1, 29009, 29009]) Position ids shape: torch.Size([1, 29009]) Input IDs shape: torch.Size([1, 29009]) Labels shape: torch.Size([1, 29009]) Final batch size: 1, sequence length: 21660 Attention mask shape: torch.Size([1, 1, 21660, 21660]) Position ids shape: torch.Size([1, 21660]) Input IDs shape: torch.Size([1, 21660]) Labels shape: torch.Size([1, 21660]) Final batch size: 1, sequence length: 12483 Attention mask shape: torch.Size([1, 1, 12483, 12483]) Position ids shape: torch.Size([1, 12483]) Input IDs shape: torch.Size([1, 12483]) Labels shape: torch.Size([1, 12483]) Final batch size: 1, sequence length: 20941 Attention mask shape: torch.Size([1, 1, 20941, 20941]) Position ids shape: torch.Size([1, 20941]) Input IDs shape: torch.Size([1, 20941]) Labels shape: torch.Size([1, 20941]) Final batch size: 1, sequence length: 24407 Attention mask shape: torch.Size([1, 1, 24407, 24407]) Position ids shape: torch.Size([1, 24407]) Input IDs shape: torch.Size([1, 24407]) Labels shape: torch.Size([1, 24407]) Final batch size: 1, sequence length: 26179 Attention mask shape: torch.Size([1, 1, 26179, 26179]) Position ids shape: torch.Size([1, 26179]) Input IDs shape: torch.Size([1, 26179]) Labels shape: torch.Size([1, 26179]) Final batch size: 1, sequence length: 29152 Attention mask shape: torch.Size([1, 1, 29152, 29152]) Position ids shape: torch.Size([1, 29152]) Input IDs shape: torch.Size([1, 29152]) Labels shape: torch.Size([1, 29152]) Final batch size: 1, sequence length: 30723 Attention mask shape: torch.Size([1, 1, 30723, 30723]) Position ids shape: torch.Size([1, 30723]) Input IDs shape: torch.Size([1, 30723]) Labels shape: torch.Size([1, 30723]) Final batch size: 1, sequence length: 28440 Attention mask shape: torch.Size([1, 1, 28440, 28440]) Position ids shape: torch.Size([1, 28440]) Input IDs shape: torch.Size([1, 28440]) Labels shape: torch.Size([1, 28440]) Final batch size: 1, sequence length: 29109 Attention mask shape: torch.Size([1, 1, 29109, 29109]) Position ids shape: torch.Size([1, 29109]) Input IDs shape: torch.Size([1, 29109]) Labels shape: torch.Size([1, 29109]) Final batch size: 1, sequence length: 26054 Attention mask shape: torch.Size([1, 1, 26054, 26054]) Position ids shape: torch.Size([1, 26054]) Input IDs shape: torch.Size([1, 26054]) Labels shape: torch.Size([1, 26054]) Final batch size: 1, sequence length: 17951 Attention mask shape: torch.Size([1, 1, 17951, 17951]) Position ids shape: torch.Size([1, 17951]) Input IDs shape: torch.Size([1, 17951]) Labels shape: torch.Size([1, 17951]) Final batch size: 1, sequence length: 32083 Attention mask shape: torch.Size([1, 1, 32083, 32083]) Position ids shape: torch.Size([1, 32083]) Input IDs shape: torch.Size([1, 32083]) Labels shape: torch.Size([1, 32083]) Final batch size: 1, sequence length: 30236 Attention mask shape: torch.Size([1, 1, 30236, 30236]) Position ids shape: torch.Size([1, 30236]) Input IDs shape: torch.Size([1, 30236]) Labels shape: torch.Size([1, 30236]) Final batch size: 1, sequence length: 29561 Attention mask shape: torch.Size([1, 1, 29561, 29561]) Position ids shape: torch.Size([1, 29561]) Input IDs shape: torch.Size([1, 29561]) Labels shape: torch.Size([1, 29561]) Final batch size: 1, sequence length: 15830 Attention mask shape: torch.Size([1, 1, 15830, 15830]) Position ids shape: torch.Size([1, 15830]) Input IDs shape: torch.Size([1, 15830]) Labels shape: torch.Size([1, 15830]) Final batch size: 1, sequence length: 13600 Attention mask shape: torch.Size([1, 1, 13600, 13600]) Position ids shape: torch.Size([1, 13600]) Input IDs shape: torch.Size([1, 13600]) Labels shape: torch.Size([1, 13600]) Final batch size: 1, sequence length: 32799 Attention mask shape: torch.Size([1, 1, 32799, 32799]) Position ids shape: torch.Size([1, 32799]) Input IDs shape: torch.Size([1, 32799]) Labels shape: torch.Size([1, 32799]) Final batch size: 1, sequence length: 27405 Attention mask shape: torch.Size([1, 1, 27405, 27405]) Position ids shape: torch.Size([1, 27405]) Input IDs shape: torch.Size([1, 27405]) Labels shape: torch.Size([1, 27405]) Final batch size: 1, sequence length: 32660 Attention mask shape: torch.Size([1, 1, 32660, 32660]) Position ids shape: torch.Size([1, 32660]) Input IDs shape: torch.Size([1, 32660]) Labels shape: torch.Size([1, 32660]) Final batch size: 1, sequence length: 21472 Attention mask shape: torch.Size([1, 1, 21472, 21472]) Position ids shape: torch.Size([1, 21472]) Input IDs shape: torch.Size([1, 21472]) Labels shape: torch.Size([1, 21472]) Final batch size: 1, sequence length: 17539 Attention mask shape: torch.Size([1, 1, 17539, 17539]) Position ids shape: torch.Size([1, 17539]) Input IDs shape: torch.Size([1, 17539]) Labels shape: torch.Size([1, 17539]) Final batch size: 1, sequence length: 34694 Attention mask shape: torch.Size([1, 1, 34694, 34694]) Position ids shape: torch.Size([1, 34694]) Input IDs shape: torch.Size([1, 34694]) Labels shape: torch.Size([1, 34694]) Final batch size: 1, sequence length: 32287 Attention mask shape: torch.Size([1, 1, 32287, 32287]) Position ids shape: torch.Size([1, 32287]) Input IDs shape: torch.Size([1, 32287]) Labels shape: torch.Size([1, 32287]) Final batch size: 1, sequence length: 17649 Attention mask shape: torch.Size([1, 1, 17649, 17649]) Position ids shape: torch.Size([1, 17649]) Input IDs shape: torch.Size([1, 17649]) Labels shape: torch.Size([1, 17649]) Final batch size: 1, sequence length: 29237 Attention mask shape: torch.Size([1, 1, 29237, 29237]) Position ids shape: torch.Size([1, 29237]) Input IDs shape: torch.Size([1, 29237]) Labels shape: torch.Size([1, 29237]) Final batch size: 1, sequence length: 24056 Attention mask shape: torch.Size([1, 1, 24056, 24056]) Position ids shape: torch.Size([1, 24056]) Input IDs shape: torch.Size([1, 24056]) Labels shape: torch.Size([1, 24056]) Final batch size: 1, sequence length: 31512 Attention mask shape: torch.Size([1, 1, 31512, 31512]) Position ids shape: torch.Size([1, 31512]) Input IDs shape: torch.Size([1, 31512]) Labels shape: torch.Size([1, 31512]) Final batch size: 1, sequence length: 15906 Attention mask shape: torch.Size([1, 1, 15906, 15906]) Position ids shape: torch.Size([1, 15906]) Input IDs shape: torch.Size([1, 15906]) Labels shape: torch.Size([1, 15906]) Final batch size: 1, sequence length: 29880 Attention mask shape: torch.Size([1, 1, 29880, 29880]) Position ids shape: torch.Size([1, 29880]) Input IDs shape: torch.Size([1, 29880]) Labels shape: torch.Size([1, 29880]) Final batch size: 1, sequence length: 23945 Attention mask shape: torch.Size([1, 1, 23945, 23945]) Position ids shape: torch.Size([1, 23945]) Input IDs shape: torch.Size([1, 23945]) Labels shape: torch.Size([1, 23945]) Final batch size: 1, sequence length: 26452 Attention mask shape: torch.Size([1, 1, 26452, 26452]) Position ids shape: torch.Size([1, 26452]) Input IDs shape: torch.Size([1, 26452]) Labels shape: torch.Size([1, 26452]) Final batch size: 1, sequence length: 8008 Attention mask shape: torch.Size([1, 1, 8008, 8008]) Position ids shape: torch.Size([1, 8008]) Input IDs shape: torch.Size([1, 8008]) Labels shape: torch.Size([1, 8008]) Final batch size: 1, sequence length: 17775 Attention mask shape: torch.Size([1, 1, 17775, 17775]) Position ids shape: torch.Size([1, 17775]) Input IDs shape: torch.Size([1, 17775]) Labels shape: torch.Size([1, 17775]) Final batch size: 1, sequence length: 27987 Attention mask shape: torch.Size([1, 1, 27987, 27987]) Position ids shape: torch.Size([1, 27987]) Input IDs shape: torch.Size([1, 27987]) Labels shape: torch.Size([1, 27987]) Final batch size: 1, sequence length: 7448 Attention mask shape: torch.Size([1, 1, 7448, 7448]) Position ids shape: torch.Size([1, 7448]) Input IDs shape: torch.Size([1, 7448]) Labels shape: torch.Size([1, 7448]) Final batch size: 1, sequence length: 34777 Attention mask shape: torch.Size([1, 1, 34777, 34777]) Position ids shape: torch.Size([1, 34777]) Input IDs shape: torch.Size([1, 34777]) Labels shape: torch.Size([1, 34777]) Final batch size: 1, sequence length: 35478 Attention mask shape: torch.Size([1, 1, 35478, 35478]) Position ids shape: torch.Size([1, 35478]) Input IDs shape: torch.Size([1, 35478]) Labels shape: torch.Size([1, 35478]) Final batch size: 1, sequence length: 35696 Attention mask shape: torch.Size([1, 1, 35696, 35696]) Position ids shape: torch.Size([1, 35696]) Input IDs shape: torch.Size([1, 35696]) Labels shape: torch.Size([1, 35696]) Final batch size: 1, sequence length: 16631 Attention mask shape: torch.Size([1, 1, 16631, 16631]) Position ids shape: torch.Size([1, 16631]) Input IDs shape: torch.Size([1, 16631]) Labels shape: torch.Size([1, 16631]) Final batch size: 1, sequence length: 16852 Attention mask shape: torch.Size([1, 1, 16852, 16852]) Position ids shape: torch.Size([1, 16852]) Input IDs shape: torch.Size([1, 16852]) Labels shape: torch.Size([1, 16852]) Final batch size: 1, sequence length: 29262 Attention mask shape: torch.Size([1, 1, 29262, 29262]) Position ids shape: torch.Size([1, 29262]) Input IDs shape: torch.Size([1, 29262]) Labels shape: torch.Size([1, 29262]) Final batch size: 1, sequence length: 31786 Attention mask shape: torch.Size([1, 1, 31786, 31786]) Position ids shape: torch.Size([1, 31786]) Input IDs shape: torch.Size([1, 31786]) Labels shape: torch.Size([1, 31786]) Final batch size: 1, sequence length: 35999 Attention mask shape: torch.Size([1, 1, 35999, 35999]) Position ids shape: torch.Size([1, 35999]) Input IDs shape: torch.Size([1, 35999]) Labels shape: torch.Size([1, 35999]) Final batch size: 1, sequence length: 28879 Attention mask shape: torch.Size([1, 1, 28879, 28879]) Position ids shape: torch.Size([1, 28879]) Input IDs shape: torch.Size([1, 28879]) Labels shape: torch.Size([1, 28879]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 24824 Attention mask shape: torch.Size([1, 1, 24824, 24824]) Position ids shape: torch.Size([1, 24824]) Input IDs shape: torch.Size([1, 24824]) Labels shape: torch.Size([1, 24824]) Final batch size: 1, sequence length: 28691 Attention mask shape: torch.Size([1, 1, 28691, 28691]) Position ids shape: torch.Size([1, 28691]) Input IDs shape: torch.Size([1, 28691]) Labels shape: torch.Size([1, 28691]) Final batch size: 1, sequence length: 36314 Attention mask shape: torch.Size([1, 1, 36314, 36314]) Position ids shape: torch.Size([1, 36314]) Input IDs shape: torch.Size([1, 36314]) Labels shape: torch.Size([1, 36314]) Final batch size: 1, sequence length: 37933 Attention mask shape: torch.Size([1, 1, 37933, 37933]) Position ids shape: torch.Size([1, 37933]) Input IDs shape: torch.Size([1, 37933]) Labels shape: torch.Size([1, 37933]) Final batch size: 1, sequence length: 39476 Attention mask shape: torch.Size([1, 1, 39476, 39476]) Position ids shape: torch.Size([1, 39476]) Input IDs shape: torch.Size([1, 39476]) Labels shape: torch.Size([1, 39476]) Final batch size: 1, sequence length: 20694 Attention mask shape: torch.Size([1, 1, 20694, 20694]) Position ids shape: torch.Size([1, 20694]) Input IDs shape: torch.Size([1, 20694]) Labels shape: torch.Size([1, 20694]) Final batch size: 1, sequence length: 6378 Attention mask shape: torch.Size([1, 1, 6378, 6378]) Position ids shape: torch.Size([1, 6378]) Input IDs shape: torch.Size([1, 6378]) Labels shape: torch.Size([1, 6378]) Final batch size: 1, sequence length: 10480 Attention mask shape: torch.Size([1, 1, 10480, 10480]) Position ids shape: torch.Size([1, 10480]) Input IDs shape: torch.Size([1, 10480]) Labels shape: torch.Size([1, 10480]) Final batch size: 1, sequence length: 31132 Attention mask shape: torch.Size([1, 1, 31132, 31132]) Position ids shape: torch.Size([1, 31132]) Input IDs shape: torch.Size([1, 31132]) Labels shape: torch.Size([1, 31132]) Final batch size: 1, sequence length: 38935 Attention mask shape: torch.Size([1, 1, 38935, 38935]) Position ids shape: torch.Size([1, 38935]) Input IDs shape: torch.Size([1, 38935]) Labels shape: torch.Size([1, 38935]) Final batch size: 1, sequence length: 35526 Attention mask shape: torch.Size([1, 1, 35526, 35526]) Position ids shape: torch.Size([1, 35526]) Input IDs shape: torch.Size([1, 35526]) Labels shape: torch.Size([1, 35526]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 28495 Attention mask shape: torch.Size([1, 1, 28495, 28495]) Position ids shape: torch.Size([1, 28495]) Input IDs shape: torch.Size([1, 28495]) Labels shape: torch.Size([1, 28495]) Final batch size: 1, sequence length: 11395 Attention mask shape: torch.Size([1, 1, 11395, 11395]) Position ids shape: torch.Size([1, 11395]) Input IDs shape: torch.Size([1, 11395]) Labels shape: torch.Size([1, 11395]) Final batch size: 1, sequence length: 14869 Attention mask shape: torch.Size([1, 1, 14869, 14869]) Position ids shape: torch.Size([1, 14869]) Input IDs shape: torch.Size([1, 14869]) Labels shape: torch.Size([1, 14869]) Final batch size: 1, sequence length: 20176 Attention mask shape: torch.Size([1, 1, 20176, 20176]) Position ids shape: torch.Size([1, 20176]) Input IDs shape: torch.Size([1, 20176]) Labels shape: torch.Size([1, 20176]) Final batch size: 1, sequence length: 35116 Attention mask shape: torch.Size([1, 1, 35116, 35116]) Position ids shape: torch.Size([1, 35116]) Input IDs shape: torch.Size([1, 35116]) Labels shape: torch.Size([1, 35116]) Final batch size: 1, sequence length: 18911 Attention mask shape: torch.Size([1, 1, 18911, 18911]) Position ids shape: torch.Size([1, 18911]) Input IDs shape: torch.Size([1, 18911]) Labels shape: torch.Size([1, 18911]) Final batch size: 1, sequence length: 37159 Attention mask shape: torch.Size([1, 1, 37159, 37159]) Position ids shape: torch.Size([1, 37159]) Input IDs shape: torch.Size([1, 37159]) Labels shape: torch.Size([1, 37159]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 18127 Attention mask shape: torch.Size([1, 1, 18127, 18127]) Position ids shape: torch.Size([1, 18127]) Input IDs shape: torch.Size([1, 18127]) Labels shape: torch.Size([1, 18127]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40605 Attention mask shape: torch.Size([1, 1, 40605, 40605]) Position ids shape: torch.Size([1, 40605]) Input IDs shape: torch.Size([1, 40605]) Labels shape: torch.Size([1, 40605]) Final batch size: 1, sequence length: 30024 Attention mask shape: torch.Size([1, 1, 30024, 30024]) Position ids shape: torch.Size([1, 30024]) Input IDs shape: torch.Size([1, 30024]) Labels shape: torch.Size([1, 30024]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 28313 Attention mask shape: torch.Size([1, 1, 28313, 28313]) Position ids shape: torch.Size([1, 28313]) Input IDs shape: torch.Size([1, 28313]) Labels shape: torch.Size([1, 28313]) Final batch size: 1, sequence length: 36469 Attention mask shape: torch.Size([1, 1, 36469, 36469]) Position ids shape: torch.Size([1, 36469]) Input IDs shape: torch.Size([1, 36469]) Labels shape: torch.Size([1, 36469]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 39687 Attention mask shape: torch.Size([1, 1, 39687, 39687]) Position ids shape: torch.Size([1, 39687]) Input IDs shape: torch.Size([1, 39687]) Labels shape: torch.Size([1, 39687]) Final batch size: 1, sequence length: 39324 Attention mask shape: torch.Size([1, 1, 39324, 39324]) Position ids shape: torch.Size([1, 39324]) Input IDs shape: torch.Size([1, 39324]) Labels shape: torch.Size([1, 39324]) Final batch size: 1, sequence length: 30245 Attention mask shape: torch.Size([1, 1, 30245, 30245]) Position ids shape: torch.Size([1, 30245]) Input IDs shape: torch.Size([1, 30245]) Labels shape: torch.Size([1, 30245]) Final batch size: 1, sequence length: 33070 Attention mask shape: torch.Size([1, 1, 33070, 33070]) Position ids shape: torch.Size([1, 33070]) Input IDs shape: torch.Size([1, 33070]) Labels shape: torch.Size([1, 33070]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 26402 Attention mask shape: torch.Size([1, 1, 26402, 26402]) Position ids shape: torch.Size([1, 26402]) Input IDs shape: torch.Size([1, 26402]) Labels shape: torch.Size([1, 26402]) {'loss': 0.2376, 'grad_norm': 0.17172191357936073, 'learning_rate': 9.549150281252633e-07, 'num_tokens': -inf, 'epoch': 6.62} Final batch size: 1, sequence length: 3968 Attention mask shape: torch.Size([1, 1, 3968, 3968]) Position ids shape: torch.Size([1, 3968]) Input IDs shape: torch.Size([1, 3968]) Labels shape: torch.Size([1, 3968]) Final batch size: 1, sequence length: 8347 Attention mask shape: torch.Size([1, 1, 8347, 8347]) Position ids shape: torch.Size([1, 8347]) Input IDs shape: torch.Size([1, 8347]) Labels shape: torch.Size([1, 8347]) Final batch size: 1, sequence length: 5997 Attention mask shape: torch.Size([1, 1, 5997, 5997]) Position ids shape: torch.Size([1, 5997]) Input IDs shape: torch.Size([1, 5997]) Labels shape: torch.Size([1, 5997]) Final batch size: 1, sequence length: 9027 Attention mask shape: torch.Size([1, 1, 9027, 9027]) Position ids shape: torch.Size([1, 9027]) Input IDs shape: torch.Size([1, 9027]) Labels shape: torch.Size([1, 9027]) Final batch size: 1, sequence length: 11752 Attention mask shape: torch.Size([1, 1, 11752, 11752]) Position ids shape: torch.Size([1, 11752]) Input IDs shape: torch.Size([1, 11752]) Labels shape: torch.Size([1, 11752]) Final batch size: 1, sequence length: 11232 Attention mask shape: torch.Size([1, 1, 11232, 11232]) Position ids shape: torch.Size([1, 11232]) Input IDs shape: torch.Size([1, 11232]) Labels shape: torch.Size([1, 11232]) Final batch size: 1, sequence length: 6567 Attention mask shape: torch.Size([1, 1, 6567, 6567]) Position ids shape: torch.Size([1, 6567]) Input IDs shape: torch.Size([1, 6567]) Labels shape: torch.Size([1, 6567]) Final batch size: 1, sequence length: 11880 Attention mask shape: torch.Size([1, 1, 11880, 11880]) Position ids shape: torch.Size([1, 11880]) Input IDs shape: torch.Size([1, 11880]) Labels shape: torch.Size([1, 11880]) Final batch size: 1, sequence length: 11054 Attention mask shape: torch.Size([1, 1, 11054, 11054]) Position ids shape: torch.Size([1, 11054]) Input IDs shape: torch.Size([1, 11054]) Labels shape: torch.Size([1, 11054]) Final batch size: 1, sequence length: 13260 Attention mask shape: torch.Size([1, 1, 13260, 13260]) Position ids shape: torch.Size([1, 13260]) Input IDs shape: torch.Size([1, 13260]) Labels shape: torch.Size([1, 13260]) Final batch size: 1, sequence length: 12370 Attention mask shape: torch.Size([1, 1, 12370, 12370]) Position ids shape: torch.Size([1, 12370]) Input IDs shape: torch.Size([1, 12370]) Labels shape: torch.Size([1, 12370]) Final batch size: 1, sequence length: 12967 Attention mask shape: torch.Size([1, 1, 12967, 12967]) Position ids shape: torch.Size([1, 12967]) Input IDs shape: torch.Size([1, 12967]) Labels shape: torch.Size([1, 12967]) Final batch size: 1, sequence length: 14758 Attention mask shape: torch.Size([1, 1, 14758, 14758]) Position ids shape: torch.Size([1, 14758]) Input IDs shape: torch.Size([1, 14758]) Labels shape: torch.Size([1, 14758]) Final batch size: 1, sequence length: 17166 Attention mask shape: torch.Size([1, 1, 17166, 17166]) Position ids shape: torch.Size([1, 17166]) Input IDs shape: torch.Size([1, 17166]) Labels shape: torch.Size([1, 17166]) Final batch size: 1, sequence length: 14597 Attention mask shape: torch.Size([1, 1, 14597, 14597]) Position ids shape: torch.Size([1, 14597]) Input IDs shape: torch.Size([1, 14597]) Labels shape: torch.Size([1, 14597]) Final batch size: 1, sequence length: 15438 Attention mask shape: torch.Size([1, 1, 15438, 15438]) Position ids shape: torch.Size([1, 15438]) Input IDs shape: torch.Size([1, 15438]) Labels shape: torch.Size([1, 15438]) Final batch size: 1, sequence length: 11206 Attention mask shape: torch.Size([1, 1, 11206, 11206]) Position ids shape: torch.Size([1, 11206]) Input IDs shape: torch.Size([1, 11206]) Labels shape: torch.Size([1, 11206]) Final batch size: 1, sequence length: 17092 Attention mask shape: torch.Size([1, 1, 17092, 17092]) Position ids shape: torch.Size([1, 17092]) Input IDs shape: torch.Size([1, 17092]) Labels shape: torch.Size([1, 17092]) Final batch size: 1, sequence length: 18131 Attention mask shape: torch.Size([1, 1, 18131, 18131]) Position ids shape: torch.Size([1, 18131]) Input IDs shape: torch.Size([1, 18131]) Labels shape: torch.Size([1, 18131]) Final batch size: 1, sequence length: 16014 Attention mask shape: torch.Size([1, 1, 16014, 16014]) Position ids shape: torch.Size([1, 16014]) Input IDs shape: torch.Size([1, 16014]) Labels shape: torch.Size([1, 16014]) Final batch size: 1, sequence length: 14512 Attention mask shape: torch.Size([1, 1, 14512, 14512]) Position ids shape: torch.Size([1, 14512]) Input IDs shape: torch.Size([1, 14512]) Labels shape: torch.Size([1, 14512]) Final batch size: 1, sequence length: 14520 Attention mask shape: torch.Size([1, 1, 14520, 14520]) Position ids shape: torch.Size([1, 14520]) Input IDs shape: torch.Size([1, 14520]) Labels shape: torch.Size([1, 14520]) Final batch size: 1, sequence length: 18264 Attention mask shape: torch.Size([1, 1, 18264, 18264]) Position ids shape: torch.Size([1, 18264]) Input IDs shape: torch.Size([1, 18264]) Labels shape: torch.Size([1, 18264]) Final batch size: 1, sequence length: 20487 Attention mask shape: torch.Size([1, 1, 20487, 20487]) Position ids shape: torch.Size([1, 20487]) Input IDs shape: torch.Size([1, 20487]) Labels shape: torch.Size([1, 20487]) Final batch size: 1, sequence length: 17839 Attention mask shape: torch.Size([1, 1, 17839, 17839]) Position ids shape: torch.Size([1, 17839]) Input IDs shape: torch.Size([1, 17839]) Labels shape: torch.Size([1, 17839]) Final batch size: 1, sequence length: 17194 Attention mask shape: torch.Size([1, 1, 17194, 17194]) Position ids shape: torch.Size([1, 17194]) Input IDs shape: torch.Size([1, 17194]) Labels shape: torch.Size([1, 17194]) Final batch size: 1, sequence length: 18050 Attention mask shape: torch.Size([1, 1, 18050, 18050]) Position ids shape: torch.Size([1, 18050]) Input IDs shape: torch.Size([1, 18050]) Labels shape: torch.Size([1, 18050]) Final batch size: 1, sequence length: 21137 Attention mask shape: torch.Size([1, 1, 21137, 21137]) Position ids shape: torch.Size([1, 21137]) Input IDs shape: torch.Size([1, 21137]) Labels shape: torch.Size([1, 21137]) Final batch size: 1, sequence length: 13061 Attention mask shape: torch.Size([1, 1, 13061, 13061]) Position ids shape: torch.Size([1, 13061]) Input IDs shape: torch.Size([1, 13061]) Labels shape: torch.Size([1, 13061]) Final batch size: 1, sequence length: 18950 Attention mask shape: torch.Size([1, 1, 18950, 18950]) Position ids shape: torch.Size([1, 18950]) Input IDs shape: torch.Size([1, 18950]) Labels shape: torch.Size([1, 18950]) Final batch size: 1, sequence length: 14335 Attention mask shape: torch.Size([1, 1, 14335, 14335]) Position ids shape: torch.Size([1, 14335]) Input IDs shape: torch.Size([1, 14335]) Labels shape: torch.Size([1, 14335]) Final batch size: 1, sequence length: 20947 Attention mask shape: torch.Size([1, 1, 20947, 20947]) Position ids shape: torch.Size([1, 20947]) Input IDs shape: torch.Size([1, 20947]) Labels shape: torch.Size([1, 20947]) Final batch size: 1, sequence length: 22107 Attention mask shape: torch.Size([1, 1, 22107, 22107]) Position ids shape: torch.Size([1, 22107]) Input IDs shape: torch.Size([1, 22107]) Labels shape: torch.Size([1, 22107]) Final batch size: 1, sequence length: 23324 Attention mask shape: torch.Size([1, 1, 23324, 23324]) Position ids shape: torch.Size([1, 23324]) Input IDs shape: torch.Size([1, 23324]) Labels shape: torch.Size([1, 23324]) Final batch size: 1, sequence length: 22547 Attention mask shape: torch.Size([1, 1, 22547, 22547]) Position ids shape: torch.Size([1, 22547]) Input IDs shape: torch.Size([1, 22547]) Labels shape: torch.Size([1, 22547]) Final batch size: 1, sequence length: 24566 Attention mask shape: torch.Size([1, 1, 24566, 24566]) Position ids shape: torch.Size([1, 24566]) Input IDs shape: torch.Size([1, 24566]) Labels shape: torch.Size([1, 24566]) Final batch size: 1, sequence length: 26068 Attention mask shape: torch.Size([1, 1, 26068, 26068]) Position ids shape: torch.Size([1, 26068]) Input IDs shape: torch.Size([1, 26068]) Labels shape: torch.Size([1, 26068]) Final batch size: 1, sequence length: 24432 Attention mask shape: torch.Size([1, 1, 24432, 24432]) Position ids shape: torch.Size([1, 24432]) Input IDs shape: torch.Size([1, 24432]) Labels shape: torch.Size([1, 24432]) Final batch size: 1, sequence length: 17115 Attention mask shape: torch.Size([1, 1, 17115, 17115]) Position ids shape: torch.Size([1, 17115]) Input IDs shape: torch.Size([1, 17115]) Labels shape: torch.Size([1, 17115]) Final batch size: 1, sequence length: 10286 Attention mask shape: torch.Size([1, 1, 10286, 10286]) Position ids shape: torch.Size([1, 10286]) Input IDs shape: torch.Size([1, 10286]) Labels shape: torch.Size([1, 10286]) Final batch size: 1, sequence length: 26333 Attention mask shape: torch.Size([1, 1, 26333, 26333]) Position ids shape: torch.Size([1, 26333]) Input IDs shape: torch.Size([1, 26333]) Labels shape: torch.Size([1, 26333]) Final batch size: 1, sequence length: 23960 Attention mask shape: torch.Size([1, 1, 23960, 23960]) Position ids shape: torch.Size([1, 23960]) Input IDs shape: torch.Size([1, 23960]) Labels shape: torch.Size([1, 23960]) Final batch size: 1, sequence length: 15229 Attention mask shape: torch.Size([1, 1, 15229, 15229]) Position ids shape: torch.Size([1, 15229]) Input IDs shape: torch.Size([1, 15229]) Labels shape: torch.Size([1, 15229]) Final batch size: 1, sequence length: 28284 Attention mask shape: torch.Size([1, 1, 28284, 28284]) Position ids shape: torch.Size([1, 28284]) Input IDs shape: torch.Size([1, 28284]) Labels shape: torch.Size([1, 28284]) Final batch size: 1, sequence length: 26271 Attention mask shape: torch.Size([1, 1, 26271, 26271]) Position ids shape: torch.Size([1, 26271]) Input IDs shape: torch.Size([1, 26271]) Labels shape: torch.Size([1, 26271]) Final batch size: 1, sequence length: 29768 Attention mask shape: torch.Size([1, 1, 29768, 29768]) Position ids shape: torch.Size([1, 29768]) Input IDs shape: torch.Size([1, 29768]) Labels shape: torch.Size([1, 29768]) Final batch size: 1, sequence length: 21728 Attention mask shape: torch.Size([1, 1, 21728, 21728]) Position ids shape: torch.Size([1, 21728]) Input IDs shape: torch.Size([1, 21728]) Labels shape: torch.Size([1, 21728]) Final batch size: 1, sequence length: 18051 Attention mask shape: torch.Size([1, 1, 18051, 18051]) Position ids shape: torch.Size([1, 18051]) Input IDs shape: torch.Size([1, 18051]) Labels shape: torch.Size([1, 18051]) Final batch size: 1, sequence length: 25252 Attention mask shape: torch.Size([1, 1, 25252, 25252]) Position ids shape: torch.Size([1, 25252]) Input IDs shape: torch.Size([1, 25252]) Labels shape: torch.Size([1, 25252]) Final batch size: 1, sequence length: 27293 Attention mask shape: torch.Size([1, 1, 27293, 27293]) Position ids shape: torch.Size([1, 27293]) Input IDs shape: torch.Size([1, 27293]) Labels shape: torch.Size([1, 27293]) Final batch size: 1, sequence length: 29478 Attention mask shape: torch.Size([1, 1, 29478, 29478]) Position ids shape: torch.Size([1, 29478]) Input IDs shape: torch.Size([1, 29478]) Labels shape: torch.Size([1, 29478]) Final batch size: 1, sequence length: 29592 Attention mask shape: torch.Size([1, 1, 29592, 29592]) Position ids shape: torch.Size([1, 29592]) Input IDs shape: torch.Size([1, 29592]) Labels shape: torch.Size([1, 29592]) Final batch size: 1, sequence length: 26937 Attention mask shape: torch.Size([1, 1, 26937, 26937]) Position ids shape: torch.Size([1, 26937]) Input IDs shape: torch.Size([1, 26937]) Labels shape: torch.Size([1, 26937]) Final batch size: 1, sequence length: 31377 Attention mask shape: torch.Size([1, 1, 31377, 31377]) Position ids shape: torch.Size([1, 31377]) Input IDs shape: torch.Size([1, 31377]) Labels shape: torch.Size([1, 31377]) Final batch size: 1, sequence length: 29236 Attention mask shape: torch.Size([1, 1, 29236, 29236]) Position ids shape: torch.Size([1, 29236]) Input IDs shape: torch.Size([1, 29236]) Labels shape: torch.Size([1, 29236]) Final batch size: 1, sequence length: 27278 Attention mask shape: torch.Size([1, 1, 27278, 27278]) Position ids shape: torch.Size([1, 27278]) Input IDs shape: torch.Size([1, 27278]) Labels shape: torch.Size([1, 27278]) Final batch size: 1, sequence length: 30862 Attention mask shape: torch.Size([1, 1, 30862, 30862]) Position ids shape: torch.Size([1, 30862]) Input IDs shape: torch.Size([1, 30862]) Labels shape: torch.Size([1, 30862]) Final batch size: 1, sequence length: 28858 Attention mask shape: torch.Size([1, 1, 28858, 28858]) Position ids shape: torch.Size([1, 28858]) Input IDs shape: torch.Size([1, 28858]) Labels shape: torch.Size([1, 28858]) Final batch size: 1, sequence length: 7364 Attention mask shape: torch.Size([1, 1, 7364, 7364]) Position ids shape: torch.Size([1, 7364]) Input IDs shape: torch.Size([1, 7364]) Labels shape: torch.Size([1, 7364]) Final batch size: 1, sequence length: 31860 Attention mask shape: torch.Size([1, 1, 31860, 31860]) Position ids shape: torch.Size([1, 31860]) Input IDs shape: torch.Size([1, 31860]) Labels shape: torch.Size([1, 31860]) Final batch size: 1, sequence length: 32112 Attention mask shape: torch.Size([1, 1, 32112, 32112]) Position ids shape: torch.Size([1, 32112]) Input IDs shape: torch.Size([1, 32112]) Labels shape: torch.Size([1, 32112]) Final batch size: 1, sequence length: 31740 Attention mask shape: torch.Size([1, 1, 31740, 31740]) Position ids shape: torch.Size([1, 31740]) Input IDs shape: torch.Size([1, 31740]) Labels shape: torch.Size([1, 31740]) Final batch size: 1, sequence length: 32392 Attention mask shape: torch.Size([1, 1, 32392, 32392]) Position ids shape: torch.Size([1, 32392]) Input IDs shape: torch.Size([1, 32392]) Labels shape: torch.Size([1, 32392]) Final batch size: 1, sequence length: 28421 Attention mask shape: torch.Size([1, 1, 28421, 28421]) Position ids shape: torch.Size([1, 28421]) Input IDs shape: torch.Size([1, 28421]) Labels shape: torch.Size([1, 28421]) Final batch size: 1, sequence length: 33736 Attention mask shape: torch.Size([1, 1, 33736, 33736]) Position ids shape: torch.Size([1, 33736]) Input IDs shape: torch.Size([1, 33736]) Labels shape: torch.Size([1, 33736]) Final batch size: 1, sequence length: 13257 Attention mask shape: torch.Size([1, 1, 13257, 13257]) Position ids shape: torch.Size([1, 13257]) Input IDs shape: torch.Size([1, 13257]) Labels shape: torch.Size([1, 13257]) Final batch size: 1, sequence length: 19888 Attention mask shape: torch.Size([1, 1, 19888, 19888]) Position ids shape: torch.Size([1, 19888]) Input IDs shape: torch.Size([1, 19888]) Labels shape: torch.Size([1, 19888]) Final batch size: 1, sequence length: 18408 Attention mask shape: torch.Size([1, 1, 18408, 18408]) Position ids shape: torch.Size([1, 18408]) Input IDs shape: torch.Size([1, 18408]) Labels shape: torch.Size([1, 18408]) Final batch size: 1, sequence length: 9974 Attention mask shape: torch.Size([1, 1, 9974, 9974]) Position ids shape: torch.Size([1, 9974]) Input IDs shape: torch.Size([1, 9974]) Labels shape: torch.Size([1, 9974]) Final batch size: 1, sequence length: 29489 Attention mask shape: torch.Size([1, 1, 29489, 29489]) Position ids shape: torch.Size([1, 29489]) Input IDs shape: torch.Size([1, 29489]) Labels shape: torch.Size([1, 29489]) Final batch size: 1, sequence length: 33097 Attention mask shape: torch.Size([1, 1, 33097, 33097]) Position ids shape: torch.Size([1, 33097]) Input IDs shape: torch.Size([1, 33097]) Labels shape: torch.Size([1, 33097]) Final batch size: 1, sequence length: 32181 Attention mask shape: torch.Size([1, 1, 32181, 32181]) Position ids shape: torch.Size([1, 32181]) Input IDs shape: torch.Size([1, 32181]) Labels shape: torch.Size([1, 32181]) Final batch size: 1, sequence length: 35596 Attention mask shape: torch.Size([1, 1, 35596, 35596]) Position ids shape: torch.Size([1, 35596]) Input IDs shape: torch.Size([1, 35596]) Labels shape: torch.Size([1, 35596]) Final batch size: 1, sequence length: 32162 Attention mask shape: torch.Size([1, 1, 32162, 32162]) Position ids shape: torch.Size([1, 32162]) Input IDs shape: torch.Size([1, 32162]) Labels shape: torch.Size([1, 32162]) Final batch size: 1, sequence length: 37623 Attention mask shape: torch.Size([1, 1, 37623, 37623]) Position ids shape: torch.Size([1, 37623]) Input IDs shape: torch.Size([1, 37623]) Labels shape: torch.Size([1, 37623]) Final batch size: 1, sequence length: 35069 Attention mask shape: torch.Size([1, 1, 35069, 35069]) Position ids shape: torch.Size([1, 35069]) Input IDs shape: torch.Size([1, 35069]) Labels shape: torch.Size([1, 35069]) Final batch size: 1, sequence length: 25484 Attention mask shape: torch.Size([1, 1, 25484, 25484]) Position ids shape: torch.Size([1, 25484]) Input IDs shape: torch.Size([1, 25484]) Labels shape: torch.Size([1, 25484]) Final batch size: 1, sequence length: 34514 Attention mask shape: torch.Size([1, 1, 34514, 34514]) Position ids shape: torch.Size([1, 34514]) Input IDs shape: torch.Size([1, 34514]) Labels shape: torch.Size([1, 34514]) Final batch size: 1, sequence length: 30302 Attention mask shape: torch.Size([1, 1, 30302, 30302]) Position ids shape: torch.Size([1, 30302]) Input IDs shape: torch.Size([1, 30302]) Labels shape: torch.Size([1, 30302]) Final batch size: 1, sequence length: 35171 Attention mask shape: torch.Size([1, 1, 35171, 35171]) Position ids shape: torch.Size([1, 35171]) Input IDs shape: torch.Size([1, 35171]) Labels shape: torch.Size([1, 35171]) Final batch size: 1, sequence length: 38790 Attention mask shape: torch.Size([1, 1, 38790, 38790]) Position ids shape: torch.Size([1, 38790]) Input IDs shape: torch.Size([1, 38790]) Labels shape: torch.Size([1, 38790]) Final batch size: 1, sequence length: 18711 Attention mask shape: torch.Size([1, 1, 18711, 18711]) Position ids shape: torch.Size([1, 18711]) Input IDs shape: torch.Size([1, 18711]) Labels shape: torch.Size([1, 18711]) Final batch size: 1, sequence length: 32979 Attention mask shape: torch.Size([1, 1, 32979, 32979]) Position ids shape: torch.Size([1, 32979]) Input IDs shape: torch.Size([1, 32979]) Labels shape: torch.Size([1, 32979]) Final batch size: 1, sequence length: 17777 Attention mask shape: torch.Size([1, 1, 17777, 17777]) Position ids shape: torch.Size([1, 17777]) Input IDs shape: torch.Size([1, 17777]) Labels shape: torch.Size([1, 17777]) Final batch size: 1, sequence length: 21500 Attention mask shape: torch.Size([1, 1, 21500, 21500]) Position ids shape: torch.Size([1, 21500]) Input IDs shape: torch.Size([1, 21500]) Labels shape: torch.Size([1, 21500]) Final batch size: 1, sequence length: 30051 Attention mask shape: torch.Size([1, 1, 30051, 30051]) Position ids shape: torch.Size([1, 30051]) Input IDs shape: torch.Size([1, 30051]) Labels shape: torch.Size([1, 30051]) Final batch size: 1, sequence length: 32327 Attention mask shape: torch.Size([1, 1, 32327, 32327]) Position ids shape: torch.Size([1, 32327]) Input IDs shape: torch.Size([1, 32327]) Labels shape: torch.Size([1, 32327]) Final batch size: 1, sequence length: 39754 Attention mask shape: torch.Size([1, 1, 39754, 39754]) Position ids shape: torch.Size([1, 39754]) Input IDs shape: torch.Size([1, 39754]) Labels shape: torch.Size([1, 39754]) Final batch size: 1, sequence length: 21408 Attention mask shape: torch.Size([1, 1, 21408, 21408]) Position ids shape: torch.Size([1, 21408]) Input IDs shape: torch.Size([1, 21408]) Labels shape: torch.Size([1, 21408]) Final batch size: 1, sequence length: 13557 Attention mask shape: torch.Size([1, 1, 13557, 13557]) Position ids shape: torch.Size([1, 13557]) Input IDs shape: torch.Size([1, 13557]) Labels shape: torch.Size([1, 13557]) Final batch size: 1, sequence length: 38351 Attention mask shape: torch.Size([1, 1, 38351, 38351]) Position ids shape: torch.Size([1, 38351]) Input IDs shape: torch.Size([1, 38351]) Labels shape: torch.Size([1, 38351]) Final batch size: 1, sequence length: 24061 Attention mask shape: torch.Size([1, 1, 24061, 24061]) Position ids shape: torch.Size([1, 24061]) Input IDs shape: torch.Size([1, 24061]) Labels shape: torch.Size([1, 24061]) Final batch size: 1, sequence length: 31679 Attention mask shape: torch.Size([1, 1, 31679, 31679]) Position ids shape: torch.Size([1, 31679]) Input IDs shape: torch.Size([1, 31679]) Labels shape: torch.Size([1, 31679]) Final batch size: 1, sequence length: 30639 Attention mask shape: torch.Size([1, 1, 30639, 30639]) Position ids shape: torch.Size([1, 30639]) Input IDs shape: torch.Size([1, 30639]) Labels shape: torch.Size([1, 30639]) Final batch size: 1, sequence length: 36535 Attention mask shape: torch.Size([1, 1, 36535, 36535]) Position ids shape: torch.Size([1, 36535]) Input IDs shape: torch.Size([1, 36535]) Labels shape: torch.Size([1, 36535]) Final batch size: 1, sequence length: 34170 Attention mask shape: torch.Size([1, 1, 34170, 34170]) Position ids shape: torch.Size([1, 34170]) Input IDs shape: torch.Size([1, 34170]) Labels shape: torch.Size([1, 34170]) Final batch size: 1, sequence length: 24943 Attention mask shape: torch.Size([1, 1, 24943, 24943]) Position ids shape: torch.Size([1, 24943]) Input IDs shape: torch.Size([1, 24943]) Labels shape: torch.Size([1, 24943]) Final batch size: 1, sequence length: 25386 Attention mask shape: torch.Size([1, 1, 25386, 25386]) Position ids shape: torch.Size([1, 25386]) Input IDs shape: torch.Size([1, 25386]) Labels shape: torch.Size([1, 25386]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 34871 Attention mask shape: torch.Size([1, 1, 34871, 34871]) Position ids shape: torch.Size([1, 34871]) Input IDs shape: torch.Size([1, 34871]) Labels shape: torch.Size([1, 34871]) Final batch size: 1, sequence length: 15697 Attention mask shape: torch.Size([1, 1, 15697, 15697]) Position ids shape: torch.Size([1, 15697]) Input IDs shape: torch.Size([1, 15697]) Labels shape: torch.Size([1, 15697]) Final batch size: 1, sequence length: 21825 Attention mask shape: torch.Size([1, 1, 21825, 21825]) Position ids shape: torch.Size([1, 21825]) Input IDs shape: torch.Size([1, 21825]) Labels shape: torch.Size([1, 21825]) Final batch size: 1, sequence length: 33807 Attention mask shape: torch.Size([1, 1, 33807, 33807]) Position ids shape: torch.Size([1, 33807]) Input IDs shape: torch.Size([1, 33807]) Labels shape: torch.Size([1, 33807]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 24910 Attention mask shape: torch.Size([1, 1, 24910, 24910]) Position ids shape: torch.Size([1, 24910]) Input IDs shape: torch.Size([1, 24910]) Labels shape: torch.Size([1, 24910]) Final batch size: 1, sequence length: 18106 Attention mask shape: torch.Size([1, 1, 18106, 18106]) Position ids shape: torch.Size([1, 18106]) Input IDs shape: torch.Size([1, 18106]) Labels shape: torch.Size([1, 18106]) Final batch size: 1, sequence length: 24840 Attention mask shape: torch.Size([1, 1, 24840, 24840]) Position ids shape: torch.Size([1, 24840]) Input IDs shape: torch.Size([1, 24840]) Labels shape: torch.Size([1, 24840]) Final batch size: 1, sequence length: 23243 Attention mask shape: torch.Size([1, 1, 23243, 23243]) Position ids shape: torch.Size([1, 23243]) Input IDs shape: torch.Size([1, 23243]) Labels shape: torch.Size([1, 23243]) Final batch size: 1, sequence length: 31513 Attention mask shape: torch.Size([1, 1, 31513, 31513]) Position ids shape: torch.Size([1, 31513]) Input IDs shape: torch.Size([1, 31513]) Labels shape: torch.Size([1, 31513]) Final batch size: 1, sequence length: 30361 Attention mask shape: torch.Size([1, 1, 30361, 30361]) Position ids shape: torch.Size([1, 30361]) Input IDs shape: torch.Size([1, 30361]) Labels shape: torch.Size([1, 30361]) Final batch size: 1, sequence length: 33883 Attention mask shape: torch.Size([1, 1, 33883, 33883]) Position ids shape: torch.Size([1, 33883]) Input IDs shape: torch.Size([1, 33883]) Labels shape: torch.Size([1, 33883]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 15180 Attention mask shape: torch.Size([1, 1, 15180, 15180]) Position ids shape: torch.Size([1, 15180]) Input IDs shape: torch.Size([1, 15180]) Labels shape: torch.Size([1, 15180]) Final batch size: 1, sequence length: 29691 Attention mask shape: torch.Size([1, 1, 29691, 29691]) Position ids shape: torch.Size([1, 29691]) Input IDs shape: torch.Size([1, 29691]) Labels shape: torch.Size([1, 29691]) Final batch size: 1, sequence length: 39483 Attention mask shape: torch.Size([1, 1, 39483, 39483]) Position ids shape: torch.Size([1, 39483]) Input IDs shape: torch.Size([1, 39483]) Labels shape: torch.Size([1, 39483]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 32351 Attention mask shape: torch.Size([1, 1, 32351, 32351]) Position ids shape: torch.Size([1, 32351]) Input IDs shape: torch.Size([1, 32351]) Labels shape: torch.Size([1, 32351]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 33549 Attention mask shape: torch.Size([1, 1, 33549, 33549]) Position ids shape: torch.Size([1, 33549]) Input IDs shape: torch.Size([1, 33549]) Labels shape: torch.Size([1, 33549]) Final batch size: 1, sequence length: 30419 Attention mask shape: torch.Size([1, 1, 30419, 30419]) Position ids shape: torch.Size([1, 30419]) Input IDs shape: torch.Size([1, 30419]) Labels shape: torch.Size([1, 30419]) Final batch size: 1, sequence length: 27228 Attention mask shape: torch.Size([1, 1, 27228, 27228]) Position ids shape: torch.Size([1, 27228]) Input IDs shape: torch.Size([1, 27228]) Labels shape: torch.Size([1, 27228]) Final batch size: 1, sequence length: 13064 Attention mask shape: torch.Size([1, 1, 13064, 13064]) Position ids shape: torch.Size([1, 13064]) Input IDs shape: torch.Size([1, 13064]) Labels shape: torch.Size([1, 13064]) Final batch size: 1, sequence length: 36532 Attention mask shape: torch.Size([1, 1, 36532, 36532]) Position ids shape: torch.Size([1, 36532]) Input IDs shape: torch.Size([1, 36532]) Labels shape: torch.Size([1, 36532]) {'loss': 0.2503, 'grad_norm': 0.17358485275238264, 'learning_rate': 8.066471602728804e-07, 'num_tokens': -inf, 'epoch': 6.75} Final batch size: 1, sequence length: 7235 Attention mask shape: torch.Size([1, 1, 7235, 7235]) Position ids shape: torch.Size([1, 7235]) Input IDs shape: torch.Size([1, 7235]) Labels shape: torch.Size([1, 7235]) Final batch size: 1, sequence length: 13575 Attention mask shape: torch.Size([1, 1, 13575, 13575]) Position ids shape: torch.Size([1, 13575]) Input IDs shape: torch.Size([1, 13575]) Labels shape: torch.Size([1, 13575]) Final batch size: 1, sequence length: 8845 Attention mask shape: torch.Size([1, 1, 8845, 8845]) Position ids shape: torch.Size([1, 8845]) Input IDs shape: torch.Size([1, 8845]) Labels shape: torch.Size([1, 8845]) Final batch size: 1, sequence length: 12562 Attention mask shape: torch.Size([1, 1, 12562, 12562]) Position ids shape: torch.Size([1, 12562]) Input IDs shape: torch.Size([1, 12562]) Labels shape: torch.Size([1, 12562]) Final batch size: 1, sequence length: 12960 Attention mask shape: torch.Size([1, 1, 12960, 12960]) Position ids shape: torch.Size([1, 12960]) Input IDs shape: torch.Size([1, 12960]) Labels shape: torch.Size([1, 12960]) Final batch size: 1, sequence length: 14689 Attention mask shape: torch.Size([1, 1, 14689, 14689]) Position ids shape: torch.Size([1, 14689]) Input IDs shape: torch.Size([1, 14689]) Labels shape: torch.Size([1, 14689]) Final batch size: 1, sequence length: 12215 Attention mask shape: torch.Size([1, 1, 12215, 12215]) Position ids shape: torch.Size([1, 12215]) Input IDs shape: torch.Size([1, 12215]) Labels shape: torch.Size([1, 12215]) Final batch size: 1, sequence length: 12622 Attention mask shape: torch.Size([1, 1, 12622, 12622]) Position ids shape: torch.Size([1, 12622]) Input IDs shape: torch.Size([1, 12622]) Labels shape: torch.Size([1, 12622]) Final batch size: 1, sequence length: 12830 Attention mask shape: torch.Size([1, 1, 12830, 12830]) Position ids shape: torch.Size([1, 12830]) Input IDs shape: torch.Size([1, 12830]) Labels shape: torch.Size([1, 12830]) Final batch size: 1, sequence length: 13665 Attention mask shape: torch.Size([1, 1, 13665, 13665]) Position ids shape: torch.Size([1, 13665]) Input IDs shape: torch.Size([1, 13665]) Labels shape: torch.Size([1, 13665]) Final batch size: 1, sequence length: 15673 Attention mask shape: torch.Size([1, 1, 15673, 15673]) Position ids shape: torch.Size([1, 15673]) Input IDs shape: torch.Size([1, 15673]) Labels shape: torch.Size([1, 15673]) Final batch size: 1, sequence length: 14833 Attention mask shape: torch.Size([1, 1, 14833, 14833]) Position ids shape: torch.Size([1, 14833]) Input IDs shape: torch.Size([1, 14833]) Labels shape: torch.Size([1, 14833]) Final batch size: 1, sequence length: 16145 Attention mask shape: torch.Size([1, 1, 16145, 16145]) Position ids shape: torch.Size([1, 16145]) Input IDs shape: torch.Size([1, 16145]) Labels shape: torch.Size([1, 16145]) Final batch size: 1, sequence length: 17026 Attention mask shape: torch.Size([1, 1, 17026, 17026]) Position ids shape: torch.Size([1, 17026]) Input IDs shape: torch.Size([1, 17026]) Labels shape: torch.Size([1, 17026]) Final batch size: 1, sequence length: 14482 Attention mask shape: torch.Size([1, 1, 14482, 14482]) Position ids shape: torch.Size([1, 14482]) Input IDs shape: torch.Size([1, 14482]) Labels shape: torch.Size([1, 14482]) Final batch size: 1, sequence length: 15816 Attention mask shape: torch.Size([1, 1, 15816, 15816]) Position ids shape: torch.Size([1, 15816]) Input IDs shape: torch.Size([1, 15816]) Labels shape: torch.Size([1, 15816]) Final batch size: 1, sequence length: 17016 Attention mask shape: torch.Size([1, 1, 17016, 17016]) Position ids shape: torch.Size([1, 17016]) Input IDs shape: torch.Size([1, 17016]) Labels shape: torch.Size([1, 17016]) Final batch size: 1, sequence length: 17512 Attention mask shape: torch.Size([1, 1, 17512, 17512]) Position ids shape: torch.Size([1, 17512]) Input IDs shape: torch.Size([1, 17512]) Labels shape: torch.Size([1, 17512]) Final batch size: 1, sequence length: 20784 Attention mask shape: torch.Size([1, 1, 20784, 20784]) Position ids shape: torch.Size([1, 20784]) Input IDs shape: torch.Size([1, 20784]) Labels shape: torch.Size([1, 20784]) Final batch size: 1, sequence length: 19225 Final batch size: 1, sequence length: 17988 Attention mask shape: torch.Size([1, 1, 19225, 19225]) Position ids shape: torch.Size([1, 19225]) Input IDs shape: torch.Size([1, 19225]) Labels shape: torch.Size([1, 19225]) Attention mask shape: torch.Size([1, 1, 17988, 17988]) Position ids shape: torch.Size([1, 17988]) Input IDs shape: torch.Size([1, 17988]) Labels shape: torch.Size([1, 17988]) Final batch size: 1, sequence length: 18922 Attention mask shape: torch.Size([1, 1, 18922, 18922]) Position ids shape: torch.Size([1, 18922]) Input IDs shape: torch.Size([1, 18922]) Labels shape: torch.Size([1, 18922]) Final batch size: 1, sequence length: 20991 Attention mask shape: torch.Size([1, 1, 20991, 20991]) Position ids shape: torch.Size([1, 20991]) Input IDs shape: torch.Size([1, 20991]) Labels shape: torch.Size([1, 20991]) Final batch size: 1, sequence length: 20630 Attention mask shape: torch.Size([1, 1, 20630, 20630]) Position ids shape: torch.Size([1, 20630]) Input IDs shape: torch.Size([1, 20630]) Labels shape: torch.Size([1, 20630]) Final batch size: 1, sequence length: 18978 Attention mask shape: torch.Size([1, 1, 18978, 18978]) Position ids shape: torch.Size([1, 18978]) Input IDs shape: torch.Size([1, 18978]) Labels shape: torch.Size([1, 18978]) Final batch size: 1, sequence length: 20389 Attention mask shape: torch.Size([1, 1, 20389, 20389]) Position ids shape: torch.Size([1, 20389]) Input IDs shape: torch.Size([1, 20389]) Labels shape: torch.Size([1, 20389]) Final batch size: 1, sequence length: 10494 Attention mask shape: torch.Size([1, 1, 10494, 10494]) Position ids shape: torch.Size([1, 10494]) Input IDs shape: torch.Size([1, 10494]) Labels shape: torch.Size([1, 10494]) Final batch size: 1, sequence length: 19409 Attention mask shape: torch.Size([1, 1, 19409, 19409]) Position ids shape: torch.Size([1, 19409]) Input IDs shape: torch.Size([1, 19409]) Labels shape: torch.Size([1, 19409]) Final batch size: 1, sequence length: 15222 Attention mask shape: torch.Size([1, 1, 15222, 15222]) Position ids shape: torch.Size([1, 15222]) Input IDs shape: torch.Size([1, 15222]) Labels shape: torch.Size([1, 15222]) Final batch size: 1, sequence length: 19325 Attention mask shape: torch.Size([1, 1, 19325, 19325]) Position ids shape: torch.Size([1, 19325]) Input IDs shape: torch.Size([1, 19325]) Labels shape: torch.Size([1, 19325]) Final batch size: 1, sequence length: 20585 Attention mask shape: torch.Size([1, 1, 20585, 20585]) Position ids shape: torch.Size([1, 20585]) Input IDs shape: torch.Size([1, 20585]) Labels shape: torch.Size([1, 20585]) Final batch size: 1, sequence length: 22949 Attention mask shape: torch.Size([1, 1, 22949, 22949]) Position ids shape: torch.Size([1, 22949]) Input IDs shape: torch.Size([1, 22949]) Labels shape: torch.Size([1, 22949]) Final batch size: 1, sequence length: 6618 Attention mask shape: torch.Size([1, 1, 6618, 6618]) Position ids shape: torch.Size([1, 6618]) Input IDs shape: torch.Size([1, 6618]) Labels shape: torch.Size([1, 6618]) Final batch size: 1, sequence length: 12969 Attention mask shape: torch.Size([1, 1, 12969, 12969]) Position ids shape: torch.Size([1, 12969]) Input IDs shape: torch.Size([1, 12969]) Labels shape: torch.Size([1, 12969]) Final batch size: 1, sequence length: 21598 Attention mask shape: torch.Size([1, 1, 21598, 21598]) Position ids shape: torch.Size([1, 21598]) Input IDs shape: torch.Size([1, 21598]) Labels shape: torch.Size([1, 21598]) Final batch size: 1, sequence length: 21075 Attention mask shape: torch.Size([1, 1, 21075, 21075]) Position ids shape: torch.Size([1, 21075]) Input IDs shape: torch.Size([1, 21075]) Labels shape: torch.Size([1, 21075]) Final batch size: 1, sequence length: 23293 Attention mask shape: torch.Size([1, 1, 23293, 23293]) Position ids shape: torch.Size([1, 23293]) Input IDs shape: torch.Size([1, 23293]) Labels shape: torch.Size([1, 23293]) Final batch size: 1, sequence length: 14577 Attention mask shape: torch.Size([1, 1, 14577, 14577]) Position ids shape: torch.Size([1, 14577]) Input IDs shape: torch.Size([1, 14577]) Labels shape: torch.Size([1, 14577]) Final batch size: 1, sequence length: 21907 Attention mask shape: torch.Size([1, 1, 21907, 21907]) Position ids shape: torch.Size([1, 21907]) Input IDs shape: torch.Size([1, 21907]) Labels shape: torch.Size([1, 21907]) Final batch size: 1, sequence length: 23755 Attention mask shape: torch.Size([1, 1, 23755, 23755]) Position ids shape: torch.Size([1, 23755]) Input IDs shape: torch.Size([1, 23755]) Labels shape: torch.Size([1, 23755]) Final batch size: 1, sequence length: 22778 Attention mask shape: torch.Size([1, 1, 22778, 22778]) Position ids shape: torch.Size([1, 22778]) Input IDs shape: torch.Size([1, 22778]) Labels shape: torch.Size([1, 22778]) Final batch size: 1, sequence length: 23805 Attention mask shape: torch.Size([1, 1, 23805, 23805]) Position ids shape: torch.Size([1, 23805]) Input IDs shape: torch.Size([1, 23805]) Labels shape: torch.Size([1, 23805]) Final batch size: 1, sequence length: 21322 Attention mask shape: torch.Size([1, 1, 21322, 21322]) Position ids shape: torch.Size([1, 21322]) Input IDs shape: torch.Size([1, 21322]) Labels shape: torch.Size([1, 21322]) Final batch size: 1, sequence length: 23437 Attention mask shape: torch.Size([1, 1, 23437, 23437]) Position ids shape: torch.Size([1, 23437]) Input IDs shape: torch.Size([1, 23437]) Labels shape: torch.Size([1, 23437]) Final batch size: 1, sequence length: 3010 Attention mask shape: torch.Size([1, 1, 3010, 3010]) Position ids shape: torch.Size([1, 3010]) Input IDs shape: torch.Size([1, 3010]) Labels shape: torch.Size([1, 3010]) Final batch size: 1, sequence length: 19724 Attention mask shape: torch.Size([1, 1, 19724, 19724]) Position ids shape: torch.Size([1, 19724]) Input IDs shape: torch.Size([1, 19724]) Labels shape: torch.Size([1, 19724]) Final batch size: 1, sequence length: 20465 Attention mask shape: torch.Size([1, 1, 20465, 20465]) Position ids shape: torch.Size([1, 20465]) Input IDs shape: torch.Size([1, 20465]) Labels shape: torch.Size([1, 20465]) Final batch size: 1, sequence length: 24107 Attention mask shape: torch.Size([1, 1, 24107, 24107]) Position ids shape: torch.Size([1, 24107]) Input IDs shape: torch.Size([1, 24107]) Labels shape: torch.Size([1, 24107]) Final batch size: 1, sequence length: 14548 Attention mask shape: torch.Size([1, 1, 14548, 14548]) Position ids shape: torch.Size([1, 14548]) Input IDs shape: torch.Size([1, 14548]) Labels shape: torch.Size([1, 14548]) Final batch size: 1, sequence length: 13072 Attention mask shape: torch.Size([1, 1, 13072, 13072]) Position ids shape: torch.Size([1, 13072]) Input IDs shape: torch.Size([1, 13072]) Labels shape: torch.Size([1, 13072]) Final batch size: 1, sequence length: 26424 Attention mask shape: torch.Size([1, 1, 26424, 26424]) Position ids shape: torch.Size([1, 26424]) Input IDs shape: torch.Size([1, 26424]) Labels shape: torch.Size([1, 26424]) Final batch size: 1, sequence length: 25934 Attention mask shape: torch.Size([1, 1, 25934, 25934]) Position ids shape: torch.Size([1, 25934]) Input IDs shape: torch.Size([1, 25934]) Labels shape: torch.Size([1, 25934]) Final batch size: 1, sequence length: 28474 Attention mask shape: torch.Size([1, 1, 28474, 28474]) Position ids shape: torch.Size([1, 28474]) Input IDs shape: torch.Size([1, 28474]) Labels shape: torch.Size([1, 28474]) Final batch size: 1, sequence length: 23442 Attention mask shape: torch.Size([1, 1, 23442, 23442]) Position ids shape: torch.Size([1, 23442]) Input IDs shape: torch.Size([1, 23442]) Labels shape: torch.Size([1, 23442]) Final batch size: 1, sequence length: 22851 Attention mask shape: torch.Size([1, 1, 22851, 22851]) Position ids shape: torch.Size([1, 22851]) Input IDs shape: torch.Size([1, 22851]) Labels shape: torch.Size([1, 22851]) Final batch size: 1, sequence length: 28012 Attention mask shape: torch.Size([1, 1, 28012, 28012]) Position ids shape: torch.Size([1, 28012]) Input IDs shape: torch.Size([1, 28012]) Labels shape: torch.Size([1, 28012]) Final batch size: 1, sequence length: 27384 Attention mask shape: torch.Size([1, 1, 27384, 27384]) Position ids shape: torch.Size([1, 27384]) Input IDs shape: torch.Size([1, 27384]) Labels shape: torch.Size([1, 27384]) Final batch size: 1, sequence length: 24364 Attention mask shape: torch.Size([1, 1, 24364, 24364]) Position ids shape: torch.Size([1, 24364]) Input IDs shape: torch.Size([1, 24364]) Labels shape: torch.Size([1, 24364]) Final batch size: 1, sequence length: 14137 Attention mask shape: torch.Size([1, 1, 14137, 14137]) Position ids shape: torch.Size([1, 14137]) Input IDs shape: torch.Size([1, 14137]) Labels shape: torch.Size([1, 14137]) Final batch size: 1, sequence length: 12234 Attention mask shape: torch.Size([1, 1, 12234, 12234]) Position ids shape: torch.Size([1, 12234]) Input IDs shape: torch.Size([1, 12234]) Labels shape: torch.Size([1, 12234]) Final batch size: 1, sequence length: 21853 Attention mask shape: torch.Size([1, 1, 21853, 21853]) Position ids shape: torch.Size([1, 21853]) Input IDs shape: torch.Size([1, 21853]) Labels shape: torch.Size([1, 21853]) Final batch size: 1, sequence length: 30235 Attention mask shape: torch.Size([1, 1, 30235, 30235]) Position ids shape: torch.Size([1, 30235]) Input IDs shape: torch.Size([1, 30235]) Labels shape: torch.Size([1, 30235]) Final batch size: 1, sequence length: 20028 Attention mask shape: torch.Size([1, 1, 20028, 20028]) Position ids shape: torch.Size([1, 20028]) Input IDs shape: torch.Size([1, 20028]) Labels shape: torch.Size([1, 20028]) Final batch size: 1, sequence length: 26454 Attention mask shape: torch.Size([1, 1, 26454, 26454]) Position ids shape: torch.Size([1, 26454]) Input IDs shape: torch.Size([1, 26454]) Labels shape: torch.Size([1, 26454]) Final batch size: 1, sequence length: 11662 Attention mask shape: torch.Size([1, 1, 11662, 11662]) Position ids shape: torch.Size([1, 11662]) Input IDs shape: torch.Size([1, 11662]) Labels shape: torch.Size([1, 11662]) Final batch size: 1, sequence length: 26500 Attention mask shape: torch.Size([1, 1, 26500, 26500]) Position ids shape: torch.Size([1, 26500]) Input IDs shape: torch.Size([1, 26500]) Labels shape: torch.Size([1, 26500]) Final batch size: 1, sequence length: 22920 Attention mask shape: torch.Size([1, 1, 22920, 22920]) Position ids shape: torch.Size([1, 22920]) Input IDs shape: torch.Size([1, 22920]) Labels shape: torch.Size([1, 22920]) Final batch size: 1, sequence length: 19027 Attention mask shape: torch.Size([1, 1, 19027, 19027]) Position ids shape: torch.Size([1, 19027]) Input IDs shape: torch.Size([1, 19027]) Labels shape: torch.Size([1, 19027]) Final batch size: 1, sequence length: 24716 Attention mask shape: torch.Size([1, 1, 24716, 24716]) Position ids shape: torch.Size([1, 24716]) Input IDs shape: torch.Size([1, 24716]) Labels shape: torch.Size([1, 24716]) Final batch size: 1, sequence length: 27520 Attention mask shape: torch.Size([1, 1, 27520, 27520]) Position ids shape: torch.Size([1, 27520]) Input IDs shape: torch.Size([1, 27520]) Labels shape: torch.Size([1, 27520]) Final batch size: 1, sequence length: 20314 Attention mask shape: torch.Size([1, 1, 20314, 20314]) Position ids shape: torch.Size([1, 20314]) Input IDs shape: torch.Size([1, 20314]) Labels shape: torch.Size([1, 20314]) Final batch size: 1, sequence length: 24111 Attention mask shape: torch.Size([1, 1, 24111, 24111]) Position ids shape: torch.Size([1, 24111]) Input IDs shape: torch.Size([1, 24111]) Labels shape: torch.Size([1, 24111]) Final batch size: 1, sequence length: 21168 Attention mask shape: torch.Size([1, 1, 21168, 21168]) Position ids shape: torch.Size([1, 21168]) Input IDs shape: torch.Size([1, 21168]) Labels shape: torch.Size([1, 21168]) Final batch size: 1, sequence length: 24923 Attention mask shape: torch.Size([1, 1, 24923, 24923]) Position ids shape: torch.Size([1, 24923]) Input IDs shape: torch.Size([1, 24923]) Labels shape: torch.Size([1, 24923]) Final batch size: 1, sequence length: 23855 Attention mask shape: torch.Size([1, 1, 23855, 23855]) Position ids shape: torch.Size([1, 23855]) Input IDs shape: torch.Size([1, 23855]) Labels shape: torch.Size([1, 23855]) Final batch size: 1, sequence length: 12519 Attention mask shape: torch.Size([1, 1, 12519, 12519]) Position ids shape: torch.Size([1, 12519]) Input IDs shape: torch.Size([1, 12519]) Labels shape: torch.Size([1, 12519]) Final batch size: 1, sequence length: 33179 Attention mask shape: torch.Size([1, 1, 33179, 33179]) Position ids shape: torch.Size([1, 33179]) Input IDs shape: torch.Size([1, 33179]) Labels shape: torch.Size([1, 33179]) Final batch size: 1, sequence length: 29842 Attention mask shape: torch.Size([1, 1, 29842, 29842]) Position ids shape: torch.Size([1, 29842]) Input IDs shape: torch.Size([1, 29842]) Labels shape: torch.Size([1, 29842]) Final batch size: 1, sequence length: 24830 Attention mask shape: torch.Size([1, 1, 24830, 24830]) Position ids shape: torch.Size([1, 24830]) Input IDs shape: torch.Size([1, 24830]) Labels shape: torch.Size([1, 24830]) Final batch size: 1, sequence length: 19302 Attention mask shape: torch.Size([1, 1, 19302, 19302]) Position ids shape: torch.Size([1, 19302]) Input IDs shape: torch.Size([1, 19302]) Labels shape: torch.Size([1, 19302]) Final batch size: 1, sequence length: 23695 Attention mask shape: torch.Size([1, 1, 23695, 23695]) Position ids shape: torch.Size([1, 23695]) Input IDs shape: torch.Size([1, 23695]) Labels shape: torch.Size([1, 23695]) Final batch size: 1, sequence length: 33765 Attention mask shape: torch.Size([1, 1, 33765, 33765]) Position ids shape: torch.Size([1, 33765]) Input IDs shape: torch.Size([1, 33765]) Labels shape: torch.Size([1, 33765]) Final batch size: 1, sequence length: 30041 Final batch size: 1, sequence length: 24187 Attention mask shape: torch.Size([1, 1, 30041, 30041]) Position ids shape: torch.Size([1, 30041]) Input IDs shape: torch.Size([1, 30041]) Labels shape: torch.Size([1, 30041]) Attention mask shape: torch.Size([1, 1, 24187, 24187]) Position ids shape: torch.Size([1, 24187]) Input IDs shape: torch.Size([1, 24187]) Labels shape: torch.Size([1, 24187]) Final batch size: 1, sequence length: 33321 Attention mask shape: torch.Size([1, 1, 33321, 33321]) Position ids shape: torch.Size([1, 33321]) Input IDs shape: torch.Size([1, 33321]) Labels shape: torch.Size([1, 33321]) Final batch size: 1, sequence length: 31404 Attention mask shape: torch.Size([1, 1, 31404, 31404]) Position ids shape: torch.Size([1, 31404]) Input IDs shape: torch.Size([1, 31404]) Labels shape: torch.Size([1, 31404]) Final batch size: 1, sequence length: 29743 Attention mask shape: torch.Size([1, 1, 29743, 29743]) Position ids shape: torch.Size([1, 29743]) Input IDs shape: torch.Size([1, 29743]) Labels shape: torch.Size([1, 29743]) Final batch size: 1, sequence length: 17763 Attention mask shape: torch.Size([1, 1, 17763, 17763]) Position ids shape: torch.Size([1, 17763]) Input IDs shape: torch.Size([1, 17763]) Labels shape: torch.Size([1, 17763]) Final batch size: 1, sequence length: 29385 Attention mask shape: torch.Size([1, 1, 29385, 29385]) Position ids shape: torch.Size([1, 29385]) Input IDs shape: torch.Size([1, 29385]) Labels shape: torch.Size([1, 29385]) Final batch size: 1, sequence length: 35555 Attention mask shape: torch.Size([1, 1, 35555, 35555]) Position ids shape: torch.Size([1, 35555]) Input IDs shape: torch.Size([1, 35555]) Labels shape: torch.Size([1, 35555]) Final batch size: 1, sequence length: 27107 Attention mask shape: torch.Size([1, 1, 27107, 27107]) Position ids shape: torch.Size([1, 27107]) Input IDs shape: torch.Size([1, 27107]) Labels shape: torch.Size([1, 27107]) Final batch size: 1, sequence length: 34169 Attention mask shape: torch.Size([1, 1, 34169, 34169]) Position ids shape: torch.Size([1, 34169]) Input IDs shape: torch.Size([1, 34169]) Labels shape: torch.Size([1, 34169]) Final batch size: 1, sequence length: 27738 Attention mask shape: torch.Size([1, 1, 27738, 27738]) Position ids shape: torch.Size([1, 27738]) Input IDs shape: torch.Size([1, 27738]) Labels shape: torch.Size([1, 27738]) Final batch size: 1, sequence length: 39018 Attention mask shape: torch.Size([1, 1, 39018, 39018]) Position ids shape: torch.Size([1, 39018]) Input IDs shape: torch.Size([1, 39018]) Labels shape: torch.Size([1, 39018]) Final batch size: 1, sequence length: 37657 Attention mask shape: torch.Size([1, 1, 37657, 37657]) Position ids shape: torch.Size([1, 37657]) Input IDs shape: torch.Size([1, 37657]) Labels shape: torch.Size([1, 37657]) Final batch size: 1, sequence length: 38576 Attention mask shape: torch.Size([1, 1, 38576, 38576]) Position ids shape: torch.Size([1, 38576]) Input IDs shape: torch.Size([1, 38576]) Labels shape: torch.Size([1, 38576]) Final batch size: 1, sequence length: 26696 Attention mask shape: torch.Size([1, 1, 26696, 26696]) Position ids shape: torch.Size([1, 26696]) Input IDs shape: torch.Size([1, 26696]) Labels shape: torch.Size([1, 26696]) Final batch size: 1, sequence length: 40006 Attention mask shape: torch.Size([1, 1, 40006, 40006]) Position ids shape: torch.Size([1, 40006]) Input IDs shape: torch.Size([1, 40006]) Labels shape: torch.Size([1, 40006]) Final batch size: 1, sequence length: 10139 Attention mask shape: torch.Size([1, 1, 10139, 10139]) Position ids shape: torch.Size([1, 10139]) Input IDs shape: torch.Size([1, 10139]) Labels shape: torch.Size([1, 10139]) Final batch size: 1, sequence length: 31115 Attention mask shape: torch.Size([1, 1, 31115, 31115]) Position ids shape: torch.Size([1, 31115]) Input IDs shape: torch.Size([1, 31115]) Labels shape: torch.Size([1, 31115]) Final batch size: 1, sequence length: 26685 Attention mask shape: torch.Size([1, 1, 26685, 26685]) Position ids shape: torch.Size([1, 26685]) Input IDs shape: torch.Size([1, 26685]) Labels shape: torch.Size([1, 26685]) Final batch size: 1, sequence length: 29085 Attention mask shape: torch.Size([1, 1, 29085, 29085]) Position ids shape: torch.Size([1, 29085]) Input IDs shape: torch.Size([1, 29085]) Labels shape: torch.Size([1, 29085]) Final batch size: 1, sequence length: 32217 Attention mask shape: torch.Size([1, 1, 32217, 32217]) Position ids shape: torch.Size([1, 32217]) Input IDs shape: torch.Size([1, 32217]) Labels shape: torch.Size([1, 32217]) Final batch size: 1, sequence length: 18734 Attention mask shape: torch.Size([1, 1, 18734, 18734]) Position ids shape: torch.Size([1, 18734]) Input IDs shape: torch.Size([1, 18734]) Labels shape: torch.Size([1, 18734]) Final batch size: 1, sequence length: 29286 Attention mask shape: torch.Size([1, 1, 29286, 29286]) Position ids shape: torch.Size([1, 29286]) Input IDs shape: torch.Size([1, 29286]) Labels shape: torch.Size([1, 29286]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 19138 Attention mask shape: torch.Size([1, 1, 19138, 19138]) Position ids shape: torch.Size([1, 19138]) Input IDs shape: torch.Size([1, 19138]) Labels shape: torch.Size([1, 19138]) Final batch size: 1, sequence length: 35949 Attention mask shape: torch.Size([1, 1, 35949, 35949]) Position ids shape: torch.Size([1, 35949]) Input IDs shape: torch.Size([1, 35949]) Labels shape: torch.Size([1, 35949]) Final batch size: 1, sequence length: 31991 Attention mask shape: torch.Size([1, 1, 31991, 31991]) Position ids shape: torch.Size([1, 31991]) Input IDs shape: torch.Size([1, 31991]) Labels shape: torch.Size([1, 31991]) Final batch size: 1, sequence length: 23848 Attention mask shape: torch.Size([1, 1, 23848, 23848]) Position ids shape: torch.Size([1, 23848]) Input IDs shape: torch.Size([1, 23848]) Labels shape: torch.Size([1, 23848]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 22500 Attention mask shape: torch.Size([1, 1, 22500, 22500]) Position ids shape: torch.Size([1, 22500]) Input IDs shape: torch.Size([1, 22500]) Labels shape: torch.Size([1, 22500]) Final batch size: 1, sequence length: 32762 Attention mask shape: torch.Size([1, 1, 32762, 32762]) Position ids shape: torch.Size([1, 32762]) Input IDs shape: torch.Size([1, 32762]) Labels shape: torch.Size([1, 32762]) Final batch size: 1, sequence length: 25990 Attention mask shape: torch.Size([1, 1, 25990, 25990]) Position ids shape: torch.Size([1, 25990]) Input IDs shape: torch.Size([1, 25990]) Labels shape: torch.Size([1, 25990]) Final batch size: 1, sequence length: 26372 Attention mask shape: torch.Size([1, 1, 26372, 26372]) Position ids shape: torch.Size([1, 26372]) Input IDs shape: torch.Size([1, 26372]) Labels shape: torch.Size([1, 26372]) Final batch size: 1, sequence length: 37680 Attention mask shape: torch.Size([1, 1, 37680, 37680]) Position ids shape: torch.Size([1, 37680]) Input IDs shape: torch.Size([1, 37680]) Labels shape: torch.Size([1, 37680]) Final batch size: 1, sequence length: 34628 Attention mask shape: torch.Size([1, 1, 34628, 34628]) Position ids shape: torch.Size([1, 34628]) Input IDs shape: torch.Size([1, 34628]) Labels shape: torch.Size([1, 34628]) Final batch size: 1, sequence length: 40834 Attention mask shape: torch.Size([1, 1, 40834, 40834]) Position ids shape: torch.Size([1, 40834]) Input IDs shape: torch.Size([1, 40834]) Labels shape: torch.Size([1, 40834]) Final batch size: 1, sequence length: 22778 Attention mask shape: torch.Size([1, 1, 22778, 22778]) Position ids shape: torch.Size([1, 22778]) Input IDs shape: torch.Size([1, 22778]) Labels shape: torch.Size([1, 22778]) Final batch size: 1, sequence length: 32675 Attention mask shape: torch.Size([1, 1, 32675, 32675]) Position ids shape: torch.Size([1, 32675]) Input IDs shape: torch.Size([1, 32675]) Labels shape: torch.Size([1, 32675]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 34791 Attention mask shape: torch.Size([1, 1, 34791, 34791]) Position ids shape: torch.Size([1, 34791]) Input IDs shape: torch.Size([1, 34791]) Labels shape: torch.Size([1, 34791]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) {'loss': 0.246, 'grad_norm': 0.15816424394164325, 'learning_rate': 6.698729810778065e-07, 'num_tokens': -inf, 'epoch': 6.88} Final batch size: 1, sequence length: 6986 Attention mask shape: torch.Size([1, 1, 6986, 6986]) Position ids shape: torch.Size([1, 6986]) Input IDs shape: torch.Size([1, 6986]) Labels shape: torch.Size([1, 6986]) Final batch size: 1, sequence length: 5686 Attention mask shape: torch.Size([1, 1, 5686, 5686]) Position ids shape: torch.Size([1, 5686]) Input IDs shape: torch.Size([1, 5686]) Labels shape: torch.Size([1, 5686]) Final batch size: 1, sequence length: 11304 Attention mask shape: torch.Size([1, 1, 11304, 11304]) Position ids shape: torch.Size([1, 11304]) Input IDs shape: torch.Size([1, 11304]) Labels shape: torch.Size([1, 11304]) Final batch size: 1, sequence length: 13224 Attention mask shape: torch.Size([1, 1, 13224, 13224]) Position ids shape: torch.Size([1, 13224]) Input IDs shape: torch.Size([1, 13224]) Labels shape: torch.Size([1, 13224]) Final batch size: 1, sequence length: 14168 Attention mask shape: torch.Size([1, 1, 14168, 14168]) Position ids shape: torch.Size([1, 14168]) Input IDs shape: torch.Size([1, 14168]) Labels shape: torch.Size([1, 14168]) Final batch size: 1, sequence length: 14304 Attention mask shape: torch.Size([1, 1, 14304, 14304]) Position ids shape: torch.Size([1, 14304]) Input IDs shape: torch.Size([1, 14304]) Labels shape: torch.Size([1, 14304]) Final batch size: 1, sequence length: 15621 Attention mask shape: torch.Size([1, 1, 15621, 15621]) Position ids shape: torch.Size([1, 15621]) Input IDs shape: torch.Size([1, 15621]) Labels shape: torch.Size([1, 15621]) Final batch size: 1, sequence length: 12847 Attention mask shape: torch.Size([1, 1, 12847, 12847]) Position ids shape: torch.Size([1, 12847]) Input IDs shape: torch.Size([1, 12847]) Labels shape: torch.Size([1, 12847]) Final batch size: 1, sequence length: 16514 Attention mask shape: torch.Size([1, 1, 16514, 16514]) Position ids shape: torch.Size([1, 16514]) Input IDs shape: torch.Size([1, 16514]) Labels shape: torch.Size([1, 16514]) Final batch size: 1, sequence length: 16440 Attention mask shape: torch.Size([1, 1, 16440, 16440]) Position ids shape: torch.Size([1, 16440]) Input IDs shape: torch.Size([1, 16440]) Labels shape: torch.Size([1, 16440]) Final batch size: 1, sequence length: 18246 Attention mask shape: torch.Size([1, 1, 18246, 18246]) Position ids shape: torch.Size([1, 18246]) Input IDs shape: torch.Size([1, 18246]) Labels shape: torch.Size([1, 18246]) Final batch size: 1, sequence length: 17852 Attention mask shape: torch.Size([1, 1, 17852, 17852]) Position ids shape: torch.Size([1, 17852]) Input IDs shape: torch.Size([1, 17852]) Labels shape: torch.Size([1, 17852]) Final batch size: 1, sequence length: 17213 Attention mask shape: torch.Size([1, 1, 17213, 17213]) Position ids shape: torch.Size([1, 17213]) Input IDs shape: torch.Size([1, 17213]) Labels shape: torch.Size([1, 17213]) Final batch size: 1, sequence length: 18044 Attention mask shape: torch.Size([1, 1, 18044, 18044]) Position ids shape: torch.Size([1, 18044]) Input IDs shape: torch.Size([1, 18044]) Labels shape: torch.Size([1, 18044]) Final batch size: 1, sequence length: 16501 Attention mask shape: torch.Size([1, 1, 16501, 16501]) Position ids shape: torch.Size([1, 16501]) Input IDs shape: torch.Size([1, 16501]) Labels shape: torch.Size([1, 16501]) Final batch size: 1, sequence length: 13872 Attention mask shape: torch.Size([1, 1, 13872, 13872]) Position ids shape: torch.Size([1, 13872]) Input IDs shape: torch.Size([1, 13872]) Labels shape: torch.Size([1, 13872]) Final batch size: 1, sequence length: 17773 Attention mask shape: torch.Size([1, 1, 17773, 17773]) Position ids shape: torch.Size([1, 17773]) Input IDs shape: torch.Size([1, 17773]) Labels shape: torch.Size([1, 17773]) Final batch size: 1, sequence length: 16378 Attention mask shape: torch.Size([1, 1, 16378, 16378]) Position ids shape: torch.Size([1, 16378]) Input IDs shape: torch.Size([1, 16378]) Labels shape: torch.Size([1, 16378]) Final batch size: 1, sequence length: 18155 Attention mask shape: torch.Size([1, 1, 18155, 18155]) Final batch size: 1, sequence length: 15652 Position ids shape: torch.Size([1, 18155]) Input IDs shape: torch.Size([1, 18155]) Labels shape: torch.Size([1, 18155]) Attention mask shape: torch.Size([1, 1, 15652, 15652]) Position ids shape: torch.Size([1, 15652]) Input IDs shape: torch.Size([1, 15652]) Labels shape: torch.Size([1, 15652]) Final batch size: 1, sequence length: 18289 Attention mask shape: torch.Size([1, 1, 18289, 18289]) Position ids shape: torch.Size([1, 18289]) Input IDs shape: torch.Size([1, 18289]) Labels shape: torch.Size([1, 18289]) Final batch size: 1, sequence length: 21207 Attention mask shape: torch.Size([1, 1, 21207, 21207]) Position ids shape: torch.Size([1, 21207]) Input IDs shape: torch.Size([1, 21207]) Labels shape: torch.Size([1, 21207]) Final batch size: 1, sequence length: 22152 Attention mask shape: torch.Size([1, 1, 22152, 22152]) Position ids shape: torch.Size([1, 22152]) Input IDs shape: torch.Size([1, 22152]) Labels shape: torch.Size([1, 22152]) Final batch size: 1, sequence length: 21163 Attention mask shape: torch.Size([1, 1, 21163, 21163]) Position ids shape: torch.Size([1, 21163]) Input IDs shape: torch.Size([1, 21163]) Labels shape: torch.Size([1, 21163]) Final batch size: 1, sequence length: 20813 Attention mask shape: torch.Size([1, 1, 20813, 20813]) Position ids shape: torch.Size([1, 20813]) Input IDs shape: torch.Size([1, 20813]) Labels shape: torch.Size([1, 20813]) Final batch size: 1, sequence length: 22784 Attention mask shape: torch.Size([1, 1, 22784, 22784]) Position ids shape: torch.Size([1, 22784]) Input IDs shape: torch.Size([1, 22784]) Labels shape: torch.Size([1, 22784]) Final batch size: 1, sequence length: 21316 Attention mask shape: torch.Size([1, 1, 21316, 21316]) Position ids shape: torch.Size([1, 21316]) Input IDs shape: torch.Size([1, 21316]) Labels shape: torch.Size([1, 21316]) Final batch size: 1, sequence length: 22117 Attention mask shape: torch.Size([1, 1, 22117, 22117]) Position ids shape: torch.Size([1, 22117]) Input IDs shape: torch.Size([1, 22117]) Labels shape: torch.Size([1, 22117]) Final batch size: 1, sequence length: 20176 Attention mask shape: torch.Size([1, 1, 20176, 20176]) Position ids shape: torch.Size([1, 20176]) Input IDs shape: torch.Size([1, 20176]) Labels shape: torch.Size([1, 20176]) Final batch size: 1, sequence length: 20766 Attention mask shape: torch.Size([1, 1, 20766, 20766]) Position ids shape: torch.Size([1, 20766]) Input IDs shape: torch.Size([1, 20766]) Labels shape: torch.Size([1, 20766]) Final batch size: 1, sequence length: 21562 Attention mask shape: torch.Size([1, 1, 21562, 21562]) Position ids shape: torch.Size([1, 21562]) Input IDs shape: torch.Size([1, 21562]) Labels shape: torch.Size([1, 21562]) Final batch size: 1, sequence length: 21705 Attention mask shape: torch.Size([1, 1, 21705, 21705]) Position ids shape: torch.Size([1, 21705]) Input IDs shape: torch.Size([1, 21705]) Labels shape: torch.Size([1, 21705]) Final batch size: 1, sequence length: 25880 Attention mask shape: torch.Size([1, 1, 25880, 25880]) Position ids shape: torch.Size([1, 25880]) Input IDs shape: torch.Size([1, 25880]) Labels shape: torch.Size([1, 25880]) Final batch size: 1, sequence length: 22197 Attention mask shape: torch.Size([1, 1, 22197, 22197]) Position ids shape: torch.Size([1, 22197]) Input IDs shape: torch.Size([1, 22197]) Labels shape: torch.Size([1, 22197]) Final batch size: 1, sequence length: 26494 Attention mask shape: torch.Size([1, 1, 26494, 26494]) Position ids shape: torch.Size([1, 26494]) Input IDs shape: torch.Size([1, 26494]) Labels shape: torch.Size([1, 26494]) Final batch size: 1, sequence length: 21607 Attention mask shape: torch.Size([1, 1, 21607, 21607]) Position ids shape: torch.Size([1, 21607]) Input IDs shape: torch.Size([1, 21607]) Labels shape: torch.Size([1, 21607]) Final batch size: 1, sequence length: 25053 Attention mask shape: torch.Size([1, 1, 25053, 25053]) Position ids shape: torch.Size([1, 25053]) Input IDs shape: torch.Size([1, 25053]) Labels shape: torch.Size([1, 25053]) Final batch size: 1, sequence length: 26921 Attention mask shape: torch.Size([1, 1, 26921, 26921]) Position ids shape: torch.Size([1, 26921]) Input IDs shape: torch.Size([1, 26921]) Labels shape: torch.Size([1, 26921]) Final batch size: 1, sequence length: 26303 Attention mask shape: torch.Size([1, 1, 26303, 26303]) Position ids shape: torch.Size([1, 26303]) Input IDs shape: torch.Size([1, 26303]) Labels shape: torch.Size([1, 26303]) Final batch size: 1, sequence length: 28523 Attention mask shape: torch.Size([1, 1, 28523, 28523]) Position ids shape: torch.Size([1, 28523]) Input IDs shape: torch.Size([1, 28523]) Labels shape: torch.Size([1, 28523]) Final batch size: 1, sequence length: 27005 Attention mask shape: torch.Size([1, 1, 27005, 27005]) Position ids shape: torch.Size([1, 27005]) Input IDs shape: torch.Size([1, 27005]) Labels shape: torch.Size([1, 27005]) Final batch size: 1, sequence length: 30151 Attention mask shape: torch.Size([1, 1, 30151, 30151]) Position ids shape: torch.Size([1, 30151]) Input IDs shape: torch.Size([1, 30151]) Labels shape: torch.Size([1, 30151]) Final batch size: 1, sequence length: 30592 Attention mask shape: torch.Size([1, 1, 30592, 30592]) Position ids shape: torch.Size([1, 30592]) Input IDs shape: torch.Size([1, 30592]) Labels shape: torch.Size([1, 30592]) Final batch size: 1, sequence length: 32596 Attention mask shape: torch.Size([1, 1, 32596, 32596]) Position ids shape: torch.Size([1, 32596]) Input IDs shape: torch.Size([1, 32596]) Labels shape: torch.Size([1, 32596]) Final batch size: 1, sequence length: 32473 Attention mask shape: torch.Size([1, 1, 32473, 32473]) Position ids shape: torch.Size([1, 32473]) Input IDs shape: torch.Size([1, 32473]) Labels shape: torch.Size([1, 32473]) Final batch size: 1, sequence length: 33482 Attention mask shape: torch.Size([1, 1, 33482, 33482]) Position ids shape: torch.Size([1, 33482]) Input IDs shape: torch.Size([1, 33482]) Labels shape: torch.Size([1, 33482]) Final batch size: 1, sequence length: 28897 Attention mask shape: torch.Size([1, 1, 28897, 28897]) Position ids shape: torch.Size([1, 28897]) Input IDs shape: torch.Size([1, 28897]) Labels shape: torch.Size([1, 28897]) Final batch size: 1, sequence length: 35485 Attention mask shape: torch.Size([1, 1, 35485, 35485]) Position ids shape: torch.Size([1, 35485]) Input IDs shape: torch.Size([1, 35485]) Labels shape: torch.Size([1, 35485]) Final batch size: 1, sequence length: 35030 Attention mask shape: torch.Size([1, 1, 35030, 35030]) Position ids shape: torch.Size([1, 35030]) Input IDs shape: torch.Size([1, 35030]) Labels shape: torch.Size([1, 35030]) Final batch size: 1, sequence length: 35866 Attention mask shape: torch.Size([1, 1, 35866, 35866]) Position ids shape: torch.Size([1, 35866]) Input IDs shape: torch.Size([1, 35866]) Labels shape: torch.Size([1, 35866]) Final batch size: 1, sequence length: 35100 Attention mask shape: torch.Size([1, 1, 35100, 35100]) Position ids shape: torch.Size([1, 35100]) Input IDs shape: torch.Size([1, 35100]) Labels shape: torch.Size([1, 35100]) Final batch size: 1, sequence length: 35523 Attention mask shape: torch.Size([1, 1, 35523, 35523]) Position ids shape: torch.Size([1, 35523]) Input IDs shape: torch.Size([1, 35523]) Labels shape: torch.Size([1, 35523]) Final batch size: 1, sequence length: 34469 Attention mask shape: torch.Size([1, 1, 34469, 34469]) Position ids shape: torch.Size([1, 34469]) Input IDs shape: torch.Size([1, 34469]) Labels shape: torch.Size([1, 34469]) Final batch size: 1, sequence length: 37980 Attention mask shape: torch.Size([1, 1, 37980, 37980]) Position ids shape: torch.Size([1, 37980]) Input IDs shape: torch.Size([1, 37980]) Labels shape: torch.Size([1, 37980]) Final batch size: 1, sequence length: 37040 Attention mask shape: torch.Size([1, 1, 37040, 37040]) Position ids shape: torch.Size([1, 37040]) Input IDs shape: torch.Size([1, 37040]) Labels shape: torch.Size([1, 37040]) Final batch size: 1, sequence length: 40526 Attention mask shape: torch.Size([1, 1, 40526, 40526]) Position ids shape: torch.Size([1, 40526]) Input IDs shape: torch.Size([1, 40526]) Labels shape: torch.Size([1, 40526]) Final batch size: 1, sequence length: 39589 Attention mask shape: torch.Size([1, 1, 39589, 39589]) Position ids shape: torch.Size([1, 39589]) Input IDs shape: torch.Size([1, 39589]) Labels shape: torch.Size([1, 39589]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 38927 Attention mask shape: torch.Size([1, 1, 38927, 38927]) Position ids shape: torch.Size([1, 38927]) Input IDs shape: torch.Size([1, 38927]) Labels shape: torch.Size([1, 38927]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) {'loss': 0.2385, 'grad_norm': 0.16268755212410746, 'learning_rate': 5.449673790581611e-07, 'num_tokens': -inf, 'epoch': 7.0} Final batch size: 1, sequence length: 5686 Attention mask shape: torch.Size([1, 1, 5686, 5686]) Position ids shape: torch.Size([1, 5686]) Input IDs shape: torch.Size([1, 5686]) Labels shape: torch.Size([1, 5686]) Final batch size: 1, sequence length: 6362 Attention mask shape: torch.Size([1, 1, 6362, 6362]) Position ids shape: torch.Size([1, 6362]) Input IDs shape: torch.Size([1, 6362]) Labels shape: torch.Size([1, 6362]) Final batch size: 1, sequence length: 6986 Attention mask shape: torch.Size([1, 1, 6986, 6986]) Position ids shape: torch.Size([1, 6986]) Input IDs shape: torch.Size([1, 6986]) Labels shape: torch.Size([1, 6986]) Final batch size: 1, sequence length: 10523 Attention mask shape: torch.Size([1, 1, 10523, 10523]) Position ids shape: torch.Size([1, 10523]) Input IDs shape: torch.Size([1, 10523]) Labels shape: torch.Size([1, 10523]) Final batch size: 1, sequence length: 11304 Attention mask shape: torch.Size([1, 1, 11304, 11304]) Position ids shape: torch.Size([1, 11304]) Input IDs shape: torch.Size([1, 11304]) Labels shape: torch.Size([1, 11304]) Final batch size: 1, sequence length: 12847 Attention mask shape: torch.Size([1, 1, 12847, 12847]) Position ids shape: torch.Size([1, 12847]) Input IDs shape: torch.Size([1, 12847]) Labels shape: torch.Size([1, 12847]) Final batch size: 1, sequence length: 14168 Attention mask shape: torch.Size([1, 1, 14168, 14168]) Position ids shape: torch.Size([1, 14168]) Input IDs shape: torch.Size([1, 14168]) Labels shape: torch.Size([1, 14168]) Final batch size: 1, sequence length: 14304 Attention mask shape: torch.Size([1, 1, 14304, 14304]) Position ids shape: torch.Size([1, 14304]) Input IDs shape: torch.Size([1, 14304]) Labels shape: torch.Size([1, 14304]) Final batch size: 1, sequence length: 13224 Attention mask shape: torch.Size([1, 1, 13224, 13224]) Position ids shape: torch.Size([1, 13224]) Input IDs shape: torch.Size([1, 13224]) Labels shape: torch.Size([1, 13224]) Final batch size: 1, sequence length: 15652 Attention mask shape: torch.Size([1, 1, 15652, 15652]) Position ids shape: torch.Size([1, 15652]) Input IDs shape: torch.Size([1, 15652]) Labels shape: torch.Size([1, 15652]) Final batch size: 1, sequence length: 16378 Attention mask shape: torch.Size([1, 1, 16378, 16378]) Position ids shape: torch.Size([1, 16378]) Input IDs shape: torch.Size([1, 16378]) Labels shape: torch.Size([1, 16378]) Final batch size: 1, sequence length: 13872 Attention mask shape: torch.Size([1, 1, 13872, 13872]) Position ids shape: torch.Size([1, 13872]) Input IDs shape: torch.Size([1, 13872]) Labels shape: torch.Size([1, 13872]) Final batch size: 1, sequence length: 15621 Attention mask shape: torch.Size([1, 1, 15621, 15621]) Position ids shape: torch.Size([1, 15621]) Input IDs shape: torch.Size([1, 15621]) Labels shape: torch.Size([1, 15621]) Final batch size: 1, sequence length: 16501 Attention mask shape: torch.Size([1, 1, 16501, 16501]) Position ids shape: torch.Size([1, 16501]) Input IDs shape: torch.Size([1, 16501]) Labels shape: torch.Size([1, 16501]) Final batch size: 1, sequence length: 17213 Attention mask shape: torch.Size([1, 1, 17213, 17213]) Position ids shape: torch.Size([1, 17213]) Input IDs shape: torch.Size([1, 17213]) Labels shape: torch.Size([1, 17213]) Final batch size: 1, sequence length: 16514 Attention mask shape: torch.Size([1, 1, 16514, 16514]) Position ids shape: torch.Size([1, 16514]) Input IDs shape: torch.Size([1, 16514]) Labels shape: torch.Size([1, 16514]) Final batch size: 1, sequence length: 18044 Attention mask shape: torch.Size([1, 1, 18044, 18044]) Position ids shape: torch.Size([1, 18044]) Input IDs shape: torch.Size([1, 18044]) Labels shape: torch.Size([1, 18044]) Final batch size: 1, sequence length: 17852 Attention mask shape: torch.Size([1, 1, 17852, 17852]) Position ids shape: torch.Size([1, 17852]) Input IDs shape: torch.Size([1, 17852]) Labels shape: torch.Size([1, 17852]) Final batch size: 1, sequence length: 18246 Attention mask shape: torch.Size([1, 1, 18246, 18246]) Position ids shape: torch.Size([1, 18246]) Input IDs shape: torch.Size([1, 18246]) Labels shape: torch.Size([1, 18246]) Final batch size: 1, sequence length: 18289 Attention mask shape: torch.Size([1, 1, 18289, 18289]) Position ids shape: torch.Size([1, 18289]) Input IDs shape: torch.Size([1, 18289]) Labels shape: torch.Size([1, 18289]) Final batch size: 1, sequence length: 20176 Attention mask shape: torch.Size([1, 1, 20176, 20176]) Position ids shape: torch.Size([1, 20176]) Input IDs shape: torch.Size([1, 20176]) Labels shape: torch.Size([1, 20176]) Final batch size: 1, sequence length: 18155 Attention mask shape: torch.Size([1, 1, 18155, 18155]) Position ids shape: torch.Size([1, 18155]) Input IDs shape: torch.Size([1, 18155]) Labels shape: torch.Size([1, 18155]) Final batch size: 1, sequence length: 20813 Attention mask shape: torch.Size([1, 1, 20813, 20813]) Position ids shape: torch.Size([1, 20813]) Input IDs shape: torch.Size([1, 20813]) Labels shape: torch.Size([1, 20813]) Final batch size: 1, sequence length: 21607 Attention mask shape: torch.Size([1, 1, 21607, 21607]) Position ids shape: torch.Size([1, 21607]) Input IDs shape: torch.Size([1, 21607]) Labels shape: torch.Size([1, 21607]) Final batch size: 1, sequence length: 5911 Attention mask shape: torch.Size([1, 1, 5911, 5911]) Position ids shape: torch.Size([1, 5911]) Input IDs shape: torch.Size([1, 5911]) Labels shape: torch.Size([1, 5911]) Final batch size: 1, sequence length: 21962 Attention mask shape: torch.Size([1, 1, 21962, 21962]) Position ids shape: torch.Size([1, 21962]) Input IDs shape: torch.Size([1, 21962]) Labels shape: torch.Size([1, 21962]) Final batch size: 1, sequence length: 21163 Attention mask shape: torch.Size([1, 1, 21163, 21163]) Position ids shape: torch.Size([1, 21163]) Input IDs shape: torch.Size([1, 21163]) Labels shape: torch.Size([1, 21163]) Final batch size: 1, sequence length: 21562 Attention mask shape: torch.Size([1, 1, 21562, 21562]) Position ids shape: torch.Size([1, 21562]) Input IDs shape: torch.Size([1, 21562]) Labels shape: torch.Size([1, 21562]) Final batch size: 1, sequence length: 17773 Attention mask shape: torch.Size([1, 1, 17773, 17773]) Position ids shape: torch.Size([1, 17773]) Input IDs shape: torch.Size([1, 17773]) Labels shape: torch.Size([1, 17773]) Final batch size: 1, sequence length: 21705 Attention mask shape: torch.Size([1, 1, 21705, 21705]) Position ids shape: torch.Size([1, 21705]) Input IDs shape: torch.Size([1, 21705]) Labels shape: torch.Size([1, 21705]) Final batch size: 1, sequence length: 21316 Attention mask shape: torch.Size([1, 1, 21316, 21316]) Position ids shape: torch.Size([1, 21316]) Input IDs shape: torch.Size([1, 21316]) Labels shape: torch.Size([1, 21316]) Final batch size: 1, sequence length: 16440 Attention mask shape: torch.Size([1, 1, 16440, 16440]) Position ids shape: torch.Size([1, 16440]) Input IDs shape: torch.Size([1, 16440]) Labels shape: torch.Size([1, 16440]) Final batch size: 1, sequence length: 22197 Attention mask shape: torch.Size([1, 1, 22197, 22197]) Position ids shape: torch.Size([1, 22197]) Input IDs shape: torch.Size([1, 22197]) Labels shape: torch.Size([1, 22197]) Final batch size: 1, sequence length: 20766 Attention mask shape: torch.Size([1, 1, 20766, 20766]) Position ids shape: torch.Size([1, 20766]) Input IDs shape: torch.Size([1, 20766]) Labels shape: torch.Size([1, 20766]) Final batch size: 1, sequence length: 22152 Attention mask shape: torch.Size([1, 1, 22152, 22152]) Position ids shape: torch.Size([1, 22152]) Input IDs shape: torch.Size([1, 22152]) Labels shape: torch.Size([1, 22152]) Final batch size: 1, sequence length: 18545 Attention mask shape: torch.Size([1, 1, 18545, 18545]) Position ids shape: torch.Size([1, 18545]) Input IDs shape: torch.Size([1, 18545]) Labels shape: torch.Size([1, 18545]) Final batch size: 1, sequence length: 21207 Attention mask shape: torch.Size([1, 1, 21207, 21207]) Position ids shape: torch.Size([1, 21207]) Input IDs shape: torch.Size([1, 21207]) Labels shape: torch.Size([1, 21207]) Final batch size: 1, sequence length: 22784 Attention mask shape: torch.Size([1, 1, 22784, 22784]) Position ids shape: torch.Size([1, 22784]) Input IDs shape: torch.Size([1, 22784]) Labels shape: torch.Size([1, 22784]) Final batch size: 1, sequence length: 25053 Attention mask shape: torch.Size([1, 1, 25053, 25053]) Position ids shape: torch.Size([1, 25053]) Input IDs shape: torch.Size([1, 25053]) Labels shape: torch.Size([1, 25053]) Final batch size: 1, sequence length: 17910 Attention mask shape: torch.Size([1, 1, 17910, 17910]) Position ids shape: torch.Size([1, 17910]) Input IDs shape: torch.Size([1, 17910]) Labels shape: torch.Size([1, 17910]) Final batch size: 1, sequence length: 12421 Attention mask shape: torch.Size([1, 1, 12421, 12421]) Position ids shape: torch.Size([1, 12421]) Input IDs shape: torch.Size([1, 12421]) Labels shape: torch.Size([1, 12421]) Final batch size: 1, sequence length: 24365 Attention mask shape: torch.Size([1, 1, 24365, 24365]) Position ids shape: torch.Size([1, 24365]) Input IDs shape: torch.Size([1, 24365]) Labels shape: torch.Size([1, 24365]) Final batch size: 1, sequence length: 22117 Attention mask shape: torch.Size([1, 1, 22117, 22117]) Position ids shape: torch.Size([1, 22117]) Input IDs shape: torch.Size([1, 22117]) Labels shape: torch.Size([1, 22117]) Final batch size: 1, sequence length: 19962 Attention mask shape: torch.Size([1, 1, 19962, 19962]) Position ids shape: torch.Size([1, 19962]) Input IDs shape: torch.Size([1, 19962]) Labels shape: torch.Size([1, 19962]) Final batch size: 1, sequence length: 14496 Attention mask shape: torch.Size([1, 1, 14496, 14496]) Position ids shape: torch.Size([1, 14496]) Input IDs shape: torch.Size([1, 14496]) Labels shape: torch.Size([1, 14496]) Final batch size: 1, sequence length: 26494 Attention mask shape: torch.Size([1, 1, 26494, 26494]) Position ids shape: torch.Size([1, 26494]) Input IDs shape: torch.Size([1, 26494]) Labels shape: torch.Size([1, 26494]) Final batch size: 1, sequence length: 26303 Attention mask shape: torch.Size([1, 1, 26303, 26303]) Position ids shape: torch.Size([1, 26303]) Input IDs shape: torch.Size([1, 26303]) Labels shape: torch.Size([1, 26303]) Final batch size: 1, sequence length: 26921 Attention mask shape: torch.Size([1, 1, 26921, 26921]) Position ids shape: torch.Size([1, 26921]) Input IDs shape: torch.Size([1, 26921]) Labels shape: torch.Size([1, 26921]) Final batch size: 1, sequence length: 20101 Attention mask shape: torch.Size([1, 1, 20101, 20101]) Position ids shape: torch.Size([1, 20101]) Input IDs shape: torch.Size([1, 20101]) Labels shape: torch.Size([1, 20101]) Final batch size: 1, sequence length: 25880 Attention mask shape: torch.Size([1, 1, 25880, 25880]) Position ids shape: torch.Size([1, 25880]) Input IDs shape: torch.Size([1, 25880]) Labels shape: torch.Size([1, 25880]) Final batch size: 1, sequence length: 27633 Attention mask shape: torch.Size([1, 1, 27633, 27633]) Position ids shape: torch.Size([1, 27633]) Input IDs shape: torch.Size([1, 27633]) Labels shape: torch.Size([1, 27633]) Final batch size: 1, sequence length: 13031 Attention mask shape: torch.Size([1, 1, 13031, 13031]) Position ids shape: torch.Size([1, 13031]) Input IDs shape: torch.Size([1, 13031]) Labels shape: torch.Size([1, 13031]) Final batch size: 1, sequence length: 23338 Attention mask shape: torch.Size([1, 1, 23338, 23338]) Position ids shape: torch.Size([1, 23338]) Input IDs shape: torch.Size([1, 23338]) Labels shape: torch.Size([1, 23338]) Final batch size: 1, sequence length: 28523 Attention mask shape: torch.Size([1, 1, 28523, 28523]) Position ids shape: torch.Size([1, 28523]) Input IDs shape: torch.Size([1, 28523]) Labels shape: torch.Size([1, 28523]) Final batch size: 1, sequence length: 11515 Attention mask shape: torch.Size([1, 1, 11515, 11515]) Position ids shape: torch.Size([1, 11515]) Input IDs shape: torch.Size([1, 11515]) Labels shape: torch.Size([1, 11515]) Final batch size: 1, sequence length: 29404 Attention mask shape: torch.Size([1, 1, 29404, 29404]) Position ids shape: torch.Size([1, 29404]) Input IDs shape: torch.Size([1, 29404]) Labels shape: torch.Size([1, 29404]) Final batch size: 1, sequence length: 23975 Attention mask shape: torch.Size([1, 1, 23975, 23975]) Position ids shape: torch.Size([1, 23975]) Input IDs shape: torch.Size([1, 23975]) Labels shape: torch.Size([1, 23975]) Final batch size: 1, sequence length: 30592 Attention mask shape: torch.Size([1, 1, 30592, 30592]) Position ids shape: torch.Size([1, 30592]) Input IDs shape: torch.Size([1, 30592]) Labels shape: torch.Size([1, 30592]) Final batch size: 1, sequence length: 16057 Attention mask shape: torch.Size([1, 1, 16057, 16057]) Position ids shape: torch.Size([1, 16057]) Input IDs shape: torch.Size([1, 16057]) Labels shape: torch.Size([1, 16057]) Final batch size: 1, sequence length: 26138 Attention mask shape: torch.Size([1, 1, 26138, 26138]) Position ids shape: torch.Size([1, 26138]) Input IDs shape: torch.Size([1, 26138]) Labels shape: torch.Size([1, 26138]) Final batch size: 1, sequence length: 18470 Attention mask shape: torch.Size([1, 1, 18470, 18470]) Position ids shape: torch.Size([1, 18470]) Input IDs shape: torch.Size([1, 18470]) Labels shape: torch.Size([1, 18470]) Final batch size: 1, sequence length: 12224 Attention mask shape: torch.Size([1, 1, 12224, 12224]) Position ids shape: torch.Size([1, 12224]) Input IDs shape: torch.Size([1, 12224]) Labels shape: torch.Size([1, 12224]) Final batch size: 1, sequence length: 21672 Attention mask shape: torch.Size([1, 1, 21672, 21672]) Position ids shape: torch.Size([1, 21672]) Input IDs shape: torch.Size([1, 21672]) Labels shape: torch.Size([1, 21672]) Final batch size: 1, sequence length: 20184 Attention mask shape: torch.Size([1, 1, 20184, 20184]) Position ids shape: torch.Size([1, 20184]) Input IDs shape: torch.Size([1, 20184]) Labels shape: torch.Size([1, 20184]) Final batch size: 1, sequence length: 21766 Attention mask shape: torch.Size([1, 1, 21766, 21766]) Position ids shape: torch.Size([1, 21766]) Input IDs shape: torch.Size([1, 21766]) Labels shape: torch.Size([1, 21766]) Final batch size: 1, sequence length: 9704 Attention mask shape: torch.Size([1, 1, 9704, 9704]) Position ids shape: torch.Size([1, 9704]) Input IDs shape: torch.Size([1, 9704]) Labels shape: torch.Size([1, 9704]) Final batch size: 1, sequence length: 33482 Attention mask shape: torch.Size([1, 1, 33482, 33482]) Position ids shape: torch.Size([1, 33482]) Input IDs shape: torch.Size([1, 33482]) Labels shape: torch.Size([1, 33482]) Final batch size: 1, sequence length: 32473 Attention mask shape: torch.Size([1, 1, 32473, 32473]) Position ids shape: torch.Size([1, 32473]) Input IDs shape: torch.Size([1, 32473]) Labels shape: torch.Size([1, 32473]) Final batch size: 1, sequence length: 23558 Attention mask shape: torch.Size([1, 1, 23558, 23558]) Position ids shape: torch.Size([1, 23558]) Input IDs shape: torch.Size([1, 23558]) Labels shape: torch.Size([1, 23558]) Final batch size: 1, sequence length: 21581 Attention mask shape: torch.Size([1, 1, 21581, 21581]) Position ids shape: torch.Size([1, 21581]) Input IDs shape: torch.Size([1, 21581]) Labels shape: torch.Size([1, 21581]) Final batch size: 1, sequence length: 28897 Attention mask shape: torch.Size([1, 1, 28897, 28897]) Position ids shape: torch.Size([1, 28897]) Input IDs shape: torch.Size([1, 28897]) Labels shape: torch.Size([1, 28897]) Final batch size: 1, sequence length: 32596 Attention mask shape: torch.Size([1, 1, 32596, 32596]) Position ids shape: torch.Size([1, 32596]) Input IDs shape: torch.Size([1, 32596]) Labels shape: torch.Size([1, 32596]) Final batch size: 1, sequence length: 11608 Attention mask shape: torch.Size([1, 1, 11608, 11608]) Position ids shape: torch.Size([1, 11608]) Input IDs shape: torch.Size([1, 11608]) Labels shape: torch.Size([1, 11608]) Final batch size: 1, sequence length: 20198 Attention mask shape: torch.Size([1, 1, 20198, 20198]) Position ids shape: torch.Size([1, 20198]) Input IDs shape: torch.Size([1, 20198]) Labels shape: torch.Size([1, 20198]) Final batch size: 1, sequence length: 17456 Attention mask shape: torch.Size([1, 1, 17456, 17456]) Position ids shape: torch.Size([1, 17456]) Input IDs shape: torch.Size([1, 17456]) Labels shape: torch.Size([1, 17456]) Final batch size: 1, sequence length: 35485 Attention mask shape: torch.Size([1, 1, 35485, 35485]) Position ids shape: torch.Size([1, 35485]) Input IDs shape: torch.Size([1, 35485]) Labels shape: torch.Size([1, 35485]) Final batch size: 1, sequence length: 34469 Attention mask shape: torch.Size([1, 1, 34469, 34469]) Position ids shape: torch.Size([1, 34469]) Input IDs shape: torch.Size([1, 34469]) Labels shape: torch.Size([1, 34469]) Final batch size: 1, sequence length: 16257 Attention mask shape: torch.Size([1, 1, 16257, 16257]) Position ids shape: torch.Size([1, 16257]) Input IDs shape: torch.Size([1, 16257]) Labels shape: torch.Size([1, 16257]) Final batch size: 1, sequence length: 29481 Attention mask shape: torch.Size([1, 1, 29481, 29481]) Position ids shape: torch.Size([1, 29481]) Input IDs shape: torch.Size([1, 29481]) Labels shape: torch.Size([1, 29481]) Final batch size: 1, sequence length: 30965 Attention mask shape: torch.Size([1, 1, 30965, 30965]) Position ids shape: torch.Size([1, 30965]) Input IDs shape: torch.Size([1, 30965]) Labels shape: torch.Size([1, 30965]) Final batch size: 1, sequence length: 35100 Attention mask shape: torch.Size([1, 1, 35100, 35100]) Position ids shape: torch.Size([1, 35100]) Input IDs shape: torch.Size([1, 35100]) Labels shape: torch.Size([1, 35100]) Final batch size: 1, sequence length: 24121 Attention mask shape: torch.Size([1, 1, 24121, 24121]) Position ids shape: torch.Size([1, 24121]) Input IDs shape: torch.Size([1, 24121]) Labels shape: torch.Size([1, 24121]) Final batch size: 1, sequence length: 18126 Attention mask shape: torch.Size([1, 1, 18126, 18126]) Position ids shape: torch.Size([1, 18126]) Input IDs shape: torch.Size([1, 18126]) Labels shape: torch.Size([1, 18126]) Final batch size: 1, sequence length: 37040 Attention mask shape: torch.Size([1, 1, 37040, 37040]) Position ids shape: torch.Size([1, 37040]) Input IDs shape: torch.Size([1, 37040]) Labels shape: torch.Size([1, 37040]) Final batch size: 1, sequence length: 11403 Attention mask shape: torch.Size([1, 1, 11403, 11403]) Position ids shape: torch.Size([1, 11403]) Input IDs shape: torch.Size([1, 11403]) Labels shape: torch.Size([1, 11403]) Final batch size: 1, sequence length: 30428 Attention mask shape: torch.Size([1, 1, 30428, 30428]) Position ids shape: torch.Size([1, 30428]) Input IDs shape: torch.Size([1, 30428]) Labels shape: torch.Size([1, 30428]) Final batch size: 1, sequence length: 18023 Attention mask shape: torch.Size([1, 1, 18023, 18023]) Position ids shape: torch.Size([1, 18023]) Input IDs shape: torch.Size([1, 18023]) Labels shape: torch.Size([1, 18023]) Final batch size: 1, sequence length: 28773 Attention mask shape: torch.Size([1, 1, 28773, 28773]) Position ids shape: torch.Size([1, 28773]) Input IDs shape: torch.Size([1, 28773]) Labels shape: torch.Size([1, 28773]) Final batch size: 1, sequence length: 35030 Attention mask shape: torch.Size([1, 1, 35030, 35030]) Position ids shape: torch.Size([1, 35030]) Input IDs shape: torch.Size([1, 35030]) Labels shape: torch.Size([1, 35030]) Final batch size: 1, sequence length: 30772 Attention mask shape: torch.Size([1, 1, 30772, 30772]) Position ids shape: torch.Size([1, 30772]) Input IDs shape: torch.Size([1, 30772]) Labels shape: torch.Size([1, 30772]) Final batch size: 1, sequence length: 30072 Attention mask shape: torch.Size([1, 1, 30072, 30072]) Position ids shape: torch.Size([1, 30072]) Input IDs shape: torch.Size([1, 30072]) Labels shape: torch.Size([1, 30072]) Final batch size: 1, sequence length: 37980 Attention mask shape: torch.Size([1, 1, 37980, 37980]) Position ids shape: torch.Size([1, 37980]) Input IDs shape: torch.Size([1, 37980]) Labels shape: torch.Size([1, 37980]) Final batch size: 1, sequence length: 39589 Attention mask shape: torch.Size([1, 1, 39589, 39589]) Position ids shape: torch.Size([1, 39589]) Input IDs shape: torch.Size([1, 39589]) Labels shape: torch.Size([1, 39589]) Final batch size: 1, sequence length: 21491 Attention mask shape: torch.Size([1, 1, 21491, 21491]) Position ids shape: torch.Size([1, 21491]) Input IDs shape: torch.Size([1, 21491]) Labels shape: torch.Size([1, 21491]) Final batch size: 1, sequence length: 29537 Attention mask shape: torch.Size([1, 1, 29537, 29537]) Position ids shape: torch.Size([1, 29537]) Input IDs shape: torch.Size([1, 29537]) Labels shape: torch.Size([1, 29537]) Final batch size: 1, sequence length: 30789 Attention mask shape: torch.Size([1, 1, 30789, 30789]) Position ids shape: torch.Size([1, 30789]) Input IDs shape: torch.Size([1, 30789]) Labels shape: torch.Size([1, 30789]) Final batch size: 1, sequence length: 17870 Attention mask shape: torch.Size([1, 1, 17870, 17870]) Position ids shape: torch.Size([1, 17870]) Input IDs shape: torch.Size([1, 17870]) Labels shape: torch.Size([1, 17870]) Final batch size: 1, sequence length: 18454 Attention mask shape: torch.Size([1, 1, 18454, 18454]) Position ids shape: torch.Size([1, 18454]) Input IDs shape: torch.Size([1, 18454]) Labels shape: torch.Size([1, 18454]) Final batch size: 1, sequence length: 19620 Attention mask shape: torch.Size([1, 1, 19620, 19620]) Position ids shape: torch.Size([1, 19620]) Input IDs shape: torch.Size([1, 19620]) Labels shape: torch.Size([1, 19620]) Final batch size: 1, sequence length: 29464 Attention mask shape: torch.Size([1, 1, 29464, 29464]) Position ids shape: torch.Size([1, 29464]) Input IDs shape: torch.Size([1, 29464]) Labels shape: torch.Size([1, 29464]) Final batch size: 1, sequence length: 26708 Attention mask shape: torch.Size([1, 1, 26708, 26708]) Position ids shape: torch.Size([1, 26708]) Input IDs shape: torch.Size([1, 26708]) Labels shape: torch.Size([1, 26708]) Final batch size: 1, sequence length: 30859 Attention mask shape: torch.Size([1, 1, 30859, 30859]) Position ids shape: torch.Size([1, 30859]) Input IDs shape: torch.Size([1, 30859]) Labels shape: torch.Size([1, 30859]) Final batch size: 1, sequence length: 17811 Attention mask shape: torch.Size([1, 1, 17811, 17811]) Position ids shape: torch.Size([1, 17811]) Input IDs shape: torch.Size([1, 17811]) Labels shape: torch.Size([1, 17811]) Final batch size: 1, sequence length: 19702 Attention mask shape: torch.Size([1, 1, 19702, 19702]) Position ids shape: torch.Size([1, 19702]) Input IDs shape: torch.Size([1, 19702]) Labels shape: torch.Size([1, 19702]) Final batch size: 1, sequence length: 26635 Attention mask shape: torch.Size([1, 1, 26635, 26635]) Position ids shape: torch.Size([1, 26635]) Input IDs shape: torch.Size([1, 26635]) Labels shape: torch.Size([1, 26635]) Final batch size: 1, sequence length: 13622 Attention mask shape: torch.Size([1, 1, 13622, 13622]) Position ids shape: torch.Size([1, 13622]) Input IDs shape: torch.Size([1, 13622]) Labels shape: torch.Size([1, 13622]) Final batch size: 1, sequence length: 26215 Attention mask shape: torch.Size([1, 1, 26215, 26215]) Position ids shape: torch.Size([1, 26215]) Input IDs shape: torch.Size([1, 26215]) Labels shape: torch.Size([1, 26215]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 37992 Attention mask shape: torch.Size([1, 1, 37992, 37992]) Position ids shape: torch.Size([1, 37992]) Input IDs shape: torch.Size([1, 37992]) Labels shape: torch.Size([1, 37992]) Final batch size: 1, sequence length: 27702 Attention mask shape: torch.Size([1, 1, 27702, 27702]) Position ids shape: torch.Size([1, 27702]) Input IDs shape: torch.Size([1, 27702]) Labels shape: torch.Size([1, 27702]) Final batch size: 1, sequence length: 21936 Attention mask shape: torch.Size([1, 1, 21936, 21936]) Position ids shape: torch.Size([1, 21936]) Input IDs shape: torch.Size([1, 21936]) Labels shape: torch.Size([1, 21936]) Final batch size: 1, sequence length: 19538 Attention mask shape: torch.Size([1, 1, 19538, 19538]) Position ids shape: torch.Size([1, 19538]) Input IDs shape: torch.Size([1, 19538]) Labels shape: torch.Size([1, 19538]) Final batch size: 1, sequence length: 21250 Attention mask shape: torch.Size([1, 1, 21250, 21250]) Position ids shape: torch.Size([1, 21250]) Input IDs shape: torch.Size([1, 21250]) Labels shape: torch.Size([1, 21250]) Final batch size: 1, sequence length: 29875 Attention mask shape: torch.Size([1, 1, 29875, 29875]) Position ids shape: torch.Size([1, 29875]) Input IDs shape: torch.Size([1, 29875]) Labels shape: torch.Size([1, 29875]) Final batch size: 1, sequence length: 38927 Attention mask shape: torch.Size([1, 1, 38927, 38927]) Position ids shape: torch.Size([1, 38927]) Input IDs shape: torch.Size([1, 38927]) Labels shape: torch.Size([1, 38927]) Final batch size: 1, sequence length: 22309 Attention mask shape: torch.Size([1, 1, 22309, 22309]) Position ids shape: torch.Size([1, 22309]) Input IDs shape: torch.Size([1, 22309]) Labels shape: torch.Size([1, 22309]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 33685 Attention mask shape: torch.Size([1, 1, 33685, 33685]) Position ids shape: torch.Size([1, 33685]) Input IDs shape: torch.Size([1, 33685]) Labels shape: torch.Size([1, 33685]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40526 Attention mask shape: torch.Size([1, 1, 40526, 40526]) Position ids shape: torch.Size([1, 40526]) Input IDs shape: torch.Size([1, 40526]) Labels shape: torch.Size([1, 40526]) Final batch size: 1, sequence length: 14730 Attention mask shape: torch.Size([1, 1, 14730, 14730]) Position ids shape: torch.Size([1, 14730]) Input IDs shape: torch.Size([1, 14730]) Labels shape: torch.Size([1, 14730]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 36580 Attention mask shape: torch.Size([1, 1, 36580, 36580]) Position ids shape: torch.Size([1, 36580]) Input IDs shape: torch.Size([1, 36580]) Labels shape: torch.Size([1, 36580]) Final batch size: 1, sequence length: 18606 Attention mask shape: torch.Size([1, 1, 18606, 18606]) Position ids shape: torch.Size([1, 18606]) Input IDs shape: torch.Size([1, 18606]) Labels shape: torch.Size([1, 18606]) Final batch size: 1, sequence length: 37866 Attention mask shape: torch.Size([1, 1, 37866, 37866]) Position ids shape: torch.Size([1, 37866]) Input IDs shape: torch.Size([1, 37866]) Labels shape: torch.Size([1, 37866]) Final batch size: 1, sequence length: 28814 Attention mask shape: torch.Size([1, 1, 28814, 28814]) Position ids shape: torch.Size([1, 28814]) Input IDs shape: torch.Size([1, 28814]) Labels shape: torch.Size([1, 28814]) Final batch size: 1, sequence length: 17623 Attention mask shape: torch.Size([1, 1, 17623, 17623]) Position ids shape: torch.Size([1, 17623]) Input IDs shape: torch.Size([1, 17623]) Labels shape: torch.Size([1, 17623]) Final batch size: 1, sequence length: 15993 Attention mask shape: torch.Size([1, 1, 15993, 15993]) Position ids shape: torch.Size([1, 15993]) Input IDs shape: torch.Size([1, 15993]) Labels shape: torch.Size([1, 15993]) Final batch size: 1, sequence length: 15830 Attention mask shape: torch.Size([1, 1, 15830, 15830]) Position ids shape: torch.Size([1, 15830]) Input IDs shape: torch.Size([1, 15830]) Labels shape: torch.Size([1, 15830]) Final batch size: 1, sequence length: 34684 Attention mask shape: torch.Size([1, 1, 34684, 34684]) Position ids shape: torch.Size([1, 34684]) Input IDs shape: torch.Size([1, 34684]) Labels shape: torch.Size([1, 34684]) Final batch size: 1, sequence length: 35077 Attention mask shape: torch.Size([1, 1, 35077, 35077]) Position ids shape: torch.Size([1, 35077]) Input IDs shape: torch.Size([1, 35077]) Labels shape: torch.Size([1, 35077]) Final batch size: 1, sequence length: 35952 Attention mask shape: torch.Size([1, 1, 35952, 35952]) Position ids shape: torch.Size([1, 35952]) Input IDs shape: torch.Size([1, 35952]) Labels shape: torch.Size([1, 35952]) Final batch size: 1, sequence length: 24232 Attention mask shape: torch.Size([1, 1, 24232, 24232]) Position ids shape: torch.Size([1, 24232]) Input IDs shape: torch.Size([1, 24232]) Labels shape: torch.Size([1, 24232]) Final batch size: 1, sequence length: 29639 Attention mask shape: torch.Size([1, 1, 29639, 29639]) Position ids shape: torch.Size([1, 29639]) Input IDs shape: torch.Size([1, 29639]) Labels shape: torch.Size([1, 29639]) Final batch size: 1, sequence length: 31745 Attention mask shape: torch.Size([1, 1, 31745, 31745]) Position ids shape: torch.Size([1, 31745]) Input IDs shape: torch.Size([1, 31745]) Labels shape: torch.Size([1, 31745]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 28810 Attention mask shape: torch.Size([1, 1, 28810, 28810]) Position ids shape: torch.Size([1, 28810]) Input IDs shape: torch.Size([1, 28810]) Labels shape: torch.Size([1, 28810]) Final batch size: 1, sequence length: 16270 Attention mask shape: torch.Size([1, 1, 16270, 16270]) Position ids shape: torch.Size([1, 16270]) Input IDs shape: torch.Size([1, 16270]) Labels shape: torch.Size([1, 16270]) Final batch size: 1, sequence length: 37407 Attention mask shape: torch.Size([1, 1, 37407, 37407]) Position ids shape: torch.Size([1, 37407]) Input IDs shape: torch.Size([1, 37407]) Labels shape: torch.Size([1, 37407]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 19028 Attention mask shape: torch.Size([1, 1, 19028, 19028]) Position ids shape: torch.Size([1, 19028]) Input IDs shape: torch.Size([1, 19028]) Labels shape: torch.Size([1, 19028]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 26930 Attention mask shape: torch.Size([1, 1, 26930, 26930]) Position ids shape: torch.Size([1, 26930]) Input IDs shape: torch.Size([1, 26930]) Labels shape: torch.Size([1, 26930]) Final batch size: 1, sequence length: 17778 Attention mask shape: torch.Size([1, 1, 17778, 17778]) Position ids shape: torch.Size([1, 17778]) Input IDs shape: torch.Size([1, 17778]) Labels shape: torch.Size([1, 17778]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 25388 Attention mask shape: torch.Size([1, 1, 25388, 25388]) Position ids shape: torch.Size([1, 25388]) Input IDs shape: torch.Size([1, 25388]) Labels shape: torch.Size([1, 25388]) Final batch size: 1, sequence length: 36716 Attention mask shape: torch.Size([1, 1, 36716, 36716]) Position ids shape: torch.Size([1, 36716]) Input IDs shape: torch.Size([1, 36716]) Labels shape: torch.Size([1, 36716]) Final batch size: 1, sequence length: 14104 Attention mask shape: torch.Size([1, 1, 14104, 14104]) Position ids shape: torch.Size([1, 14104]) Input IDs shape: torch.Size([1, 14104]) Labels shape: torch.Size([1, 14104]) Final batch size: 1, sequence length: 20057 Attention mask shape: torch.Size([1, 1, 20057, 20057]) Position ids shape: torch.Size([1, 20057]) Input IDs shape: torch.Size([1, 20057]) Labels shape: torch.Size([1, 20057]) Final batch size: 1, sequence length: 39661 Attention mask shape: torch.Size([1, 1, 39661, 39661]) Position ids shape: torch.Size([1, 39661]) Input IDs shape: torch.Size([1, 39661]) Labels shape: torch.Size([1, 39661]) Final batch size: 1, sequence length: 21071 Attention mask shape: torch.Size([1, 1, 21071, 21071]) Position ids shape: torch.Size([1, 21071]) Input IDs shape: torch.Size([1, 21071]) Labels shape: torch.Size([1, 21071]) Final batch size: 1, sequence length: 20765 Attention mask shape: torch.Size([1, 1, 20765, 20765]) Position ids shape: torch.Size([1, 20765]) Input IDs shape: torch.Size([1, 20765]) Labels shape: torch.Size([1, 20765]) Final batch size: 1, sequence length: 30135 Attention mask shape: torch.Size([1, 1, 30135, 30135]) Position ids shape: torch.Size([1, 30135]) Input IDs shape: torch.Size([1, 30135]) Labels shape: torch.Size([1, 30135]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 31381 Attention mask shape: torch.Size([1, 1, 31381, 31381]) Position ids shape: torch.Size([1, 31381]) Input IDs shape: torch.Size([1, 31381]) Labels shape: torch.Size([1, 31381]) Final batch size: 1, sequence length: 17971 Attention mask shape: torch.Size([1, 1, 17971, 17971]) Position ids shape: torch.Size([1, 17971]) Input IDs shape: torch.Size([1, 17971]) Labels shape: torch.Size([1, 17971]) Final batch size: 1, sequence length: 27281 Attention mask shape: torch.Size([1, 1, 27281, 27281]) Position ids shape: torch.Size([1, 27281]) Input IDs shape: torch.Size([1, 27281]) Labels shape: torch.Size([1, 27281]) Final batch size: 1, sequence length: 22945 Attention mask shape: torch.Size([1, 1, 22945, 22945]) Position ids shape: torch.Size([1, 22945]) Input IDs shape: torch.Size([1, 22945]) Labels shape: torch.Size([1, 22945]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 26665 Attention mask shape: torch.Size([1, 1, 26665, 26665]) Position ids shape: torch.Size([1, 26665]) Input IDs shape: torch.Size([1, 26665]) Labels shape: torch.Size([1, 26665]) Final batch size: 1, sequence length: 21805 Attention mask shape: torch.Size([1, 1, 21805, 21805]) Position ids shape: torch.Size([1, 21805]) Input IDs shape: torch.Size([1, 21805]) Labels shape: torch.Size([1, 21805]) Final batch size: 1, sequence length: 31420 Attention mask shape: torch.Size([1, 1, 31420, 31420]) Position ids shape: torch.Size([1, 31420]) Input IDs shape: torch.Size([1, 31420]) Labels shape: torch.Size([1, 31420]) Final batch size: 1, sequence length: 21006 Attention mask shape: torch.Size([1, 1, 21006, 21006]) Position ids shape: torch.Size([1, 21006]) Input IDs shape: torch.Size([1, 21006]) Labels shape: torch.Size([1, 21006]) Final batch size: 1, sequence length: 13903 Attention mask shape: torch.Size([1, 1, 13903, 13903]) Position ids shape: torch.Size([1, 13903]) Input IDs shape: torch.Size([1, 13903]) Labels shape: torch.Size([1, 13903]) Final batch size: 1, sequence length: 25032 Attention mask shape: torch.Size([1, 1, 25032, 25032]) Position ids shape: torch.Size([1, 25032]) Input IDs shape: torch.Size([1, 25032]) Labels shape: torch.Size([1, 25032]) Final batch size: 1, sequence length: 37397 Attention mask shape: torch.Size([1, 1, 37397, 37397]) Position ids shape: torch.Size([1, 37397]) Input IDs shape: torch.Size([1, 37397]) Labels shape: torch.Size([1, 37397]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 23187 Attention mask shape: torch.Size([1, 1, 23187, 23187]) Position ids shape: torch.Size([1, 23187]) Input IDs shape: torch.Size([1, 23187]) Labels shape: torch.Size([1, 23187]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 32024 Attention mask shape: torch.Size([1, 1, 32024, 32024]) Position ids shape: torch.Size([1, 32024]) Input IDs shape: torch.Size([1, 32024]) Labels shape: torch.Size([1, 32024]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 39778 Attention mask shape: torch.Size([1, 1, 39778, 39778]) Position ids shape: torch.Size([1, 39778]) Input IDs shape: torch.Size([1, 39778]) Labels shape: torch.Size([1, 39778]) Final batch size: 1, sequence length: 35879 Attention mask shape: torch.Size([1, 1, 35879, 35879]) Position ids shape: torch.Size([1, 35879]) Input IDs shape: torch.Size([1, 35879]) Labels shape: torch.Size([1, 35879]) Final batch size: 1, sequence length: 40579 Attention mask shape: torch.Size([1, 1, 40579, 40579]) Position ids shape: torch.Size([1, 40579]) Input IDs shape: torch.Size([1, 40579]) Labels shape: torch.Size([1, 40579]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 36415 Attention mask shape: torch.Size([1, 1, 36415, 36415]) Position ids shape: torch.Size([1, 36415]) Input IDs shape: torch.Size([1, 36415]) Labels shape: torch.Size([1, 36415]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 18566 Attention mask shape: torch.Size([1, 1, 18566, 18566]) Position ids shape: torch.Size([1, 18566]) Input IDs shape: torch.Size([1, 18566]) Labels shape: torch.Size([1, 18566]) Final batch size: 1, sequence length: 12200 Attention mask shape: torch.Size([1, 1, 12200, 12200]) Position ids shape: torch.Size([1, 12200]) Input IDs shape: torch.Size([1, 12200]) Labels shape: torch.Size([1, 12200]) Final batch size: 1, sequence length: 37530 Attention mask shape: torch.Size([1, 1, 37530, 37530]) Position ids shape: torch.Size([1, 37530]) Input IDs shape: torch.Size([1, 37530]) Labels shape: torch.Size([1, 37530]) Final batch size: 1, sequence length: 37890 Attention mask shape: torch.Size([1, 1, 37890, 37890]) Position ids shape: torch.Size([1, 37890]) Input IDs shape: torch.Size([1, 37890]) Labels shape: torch.Size([1, 37890]) Final batch size: 1, sequence length: 19668 Attention mask shape: torch.Size([1, 1, 19668, 19668]) Position ids shape: torch.Size([1, 19668]) Final batch size: 1, sequence length: 25383 Input IDs shape: torch.Size([1, 19668]) Labels shape: torch.Size([1, 19668]) Attention mask shape: torch.Size([1, 1, 25383, 25383]) Position ids shape: torch.Size([1, 25383]) Input IDs shape: torch.Size([1, 25383]) Labels shape: torch.Size([1, 25383]) Final batch size: 1, sequence length: 39587 Attention mask shape: torch.Size([1, 1, 39587, 39587]) Position ids shape: torch.Size([1, 39587]) Input IDs shape: torch.Size([1, 39587]) Labels shape: torch.Size([1, 39587]) Final batch size: 1, sequence length: 32541 Attention mask shape: torch.Size([1, 1, 32541, 32541]) Position ids shape: torch.Size([1, 32541]) Input IDs shape: torch.Size([1, 32541]) Labels shape: torch.Size([1, 32541]) Final batch size: 1, sequence length: 39579 Attention mask shape: torch.Size([1, 1, 39579, 39579]) Position ids shape: torch.Size([1, 39579]) Input IDs shape: torch.Size([1, 39579]) Labels shape: torch.Size([1, 39579]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 14308 Attention mask shape: torch.Size([1, 1, 14308, 14308]) Position ids shape: torch.Size([1, 14308]) Input IDs shape: torch.Size([1, 14308]) Labels shape: torch.Size([1, 14308]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 36647 Attention mask shape: torch.Size([1, 1, 36647, 36647]) Position ids shape: torch.Size([1, 36647]) Input IDs shape: torch.Size([1, 36647]) Labels shape: torch.Size([1, 36647]) Final batch size: 1, sequence length: 9251 Attention mask shape: torch.Size([1, 1, 9251, 9251]) Position ids shape: torch.Size([1, 9251]) Input IDs shape: torch.Size([1, 9251]) Labels shape: torch.Size([1, 9251]) {'loss': 0.2304, 'grad_norm': 0.1690609279451604, 'learning_rate': 4.322727117869951e-07, 'num_tokens': -inf, 'epoch': 7.12} Final batch size: 1, sequence length: 4858 Attention mask shape: torch.Size([1, 1, 4858, 4858]) Position ids shape: torch.Size([1, 4858]) Input IDs shape: torch.Size([1, 4858]) Labels shape: torch.Size([1, 4858]) Final batch size: 1, sequence length: 6316 Attention mask shape: torch.Size([1, 1, 6316, 6316]) Position ids shape: torch.Size([1, 6316]) Input IDs shape: torch.Size([1, 6316]) Labels shape: torch.Size([1, 6316]) Final batch size: 1, sequence length: 11448 Attention mask shape: torch.Size([1, 1, 11448, 11448]) Position ids shape: torch.Size([1, 11448]) Input IDs shape: torch.Size([1, 11448]) Labels shape: torch.Size([1, 11448]) Final batch size: 1, sequence length: 12846 Attention mask shape: torch.Size([1, 1, 12846, 12846]) Position ids shape: torch.Size([1, 12846]) Input IDs shape: torch.Size([1, 12846]) Labels shape: torch.Size([1, 12846]) Final batch size: 1, sequence length: 12075 Attention mask shape: torch.Size([1, 1, 12075, 12075]) Position ids shape: torch.Size([1, 12075]) Input IDs shape: torch.Size([1, 12075]) Labels shape: torch.Size([1, 12075]) Final batch size: 1, sequence length: 7360 Attention mask shape: torch.Size([1, 1, 7360, 7360]) Position ids shape: torch.Size([1, 7360]) Input IDs shape: torch.Size([1, 7360]) Labels shape: torch.Size([1, 7360]) Final batch size: 1, sequence length: 12945 Attention mask shape: torch.Size([1, 1, 12945, 12945]) Position ids shape: torch.Size([1, 12945]) Input IDs shape: torch.Size([1, 12945]) Labels shape: torch.Size([1, 12945]) Final batch size: 1, sequence length: 14891 Attention mask shape: torch.Size([1, 1, 14891, 14891]) Position ids shape: torch.Size([1, 14891]) Input IDs shape: torch.Size([1, 14891]) Labels shape: torch.Size([1, 14891]) Final batch size: 1, sequence length: 15189 Attention mask shape: torch.Size([1, 1, 15189, 15189]) Position ids shape: torch.Size([1, 15189]) Input IDs shape: torch.Size([1, 15189]) Labels shape: torch.Size([1, 15189]) Final batch size: 1, sequence length: 14330 Final batch size: 1, sequence length: 16658 Attention mask shape: torch.Size([1, 1, 14330, 14330]) Attention mask shape: torch.Size([1, 1, 16658, 16658]) Position ids shape: torch.Size([1, 14330]) Input IDs shape: torch.Size([1, 14330]) Labels shape: torch.Size([1, 14330]) Position ids shape: torch.Size([1, 16658]) Input IDs shape: torch.Size([1, 16658]) Labels shape: torch.Size([1, 16658]) Final batch size: 1, sequence length: 16391 Attention mask shape: torch.Size([1, 1, 16391, 16391]) Position ids shape: torch.Size([1, 16391]) Input IDs shape: torch.Size([1, 16391]) Labels shape: torch.Size([1, 16391]) Final batch size: 1, sequence length: 16961 Attention mask shape: torch.Size([1, 1, 16961, 16961]) Position ids shape: torch.Size([1, 16961]) Input IDs shape: torch.Size([1, 16961]) Labels shape: torch.Size([1, 16961]) Final batch size: 1, sequence length: 17246 Attention mask shape: torch.Size([1, 1, 17246, 17246]) Position ids shape: torch.Size([1, 17246]) Input IDs shape: torch.Size([1, 17246]) Labels shape: torch.Size([1, 17246]) Final batch size: 1, sequence length: 18400 Attention mask shape: torch.Size([1, 1, 18400, 18400]) Position ids shape: torch.Size([1, 18400]) Input IDs shape: torch.Size([1, 18400]) Labels shape: torch.Size([1, 18400]) Final batch size: 1, sequence length: 16536 Attention mask shape: torch.Size([1, 1, 16536, 16536]) Position ids shape: torch.Size([1, 16536]) Input IDs shape: torch.Size([1, 16536]) Labels shape: torch.Size([1, 16536]) Final batch size: 1, sequence length: 16326 Attention mask shape: torch.Size([1, 1, 16326, 16326]) Position ids shape: torch.Size([1, 16326]) Input IDs shape: torch.Size([1, 16326]) Labels shape: torch.Size([1, 16326]) Final batch size: 1, sequence length: 19999 Attention mask shape: torch.Size([1, 1, 19999, 19999]) Position ids shape: torch.Size([1, 19999]) Input IDs shape: torch.Size([1, 19999]) Labels shape: torch.Size([1, 19999]) Final batch size: 1, sequence length: 18341 Attention mask shape: torch.Size([1, 1, 18341, 18341]) Position ids shape: torch.Size([1, 18341]) Input IDs shape: torch.Size([1, 18341]) Labels shape: torch.Size([1, 18341]) Final batch size: 1, sequence length: 19332 Attention mask shape: torch.Size([1, 1, 19332, 19332]) Position ids shape: torch.Size([1, 19332]) Input IDs shape: torch.Size([1, 19332]) Labels shape: torch.Size([1, 19332]) Final batch size: 1, sequence length: 18424 Attention mask shape: torch.Size([1, 1, 18424, 18424]) Position ids shape: torch.Size([1, 18424]) Input IDs shape: torch.Size([1, 18424]) Labels shape: torch.Size([1, 18424]) Final batch size: 1, sequence length: 18014 Attention mask shape: torch.Size([1, 1, 18014, 18014]) Position ids shape: torch.Size([1, 18014]) Input IDs shape: torch.Size([1, 18014]) Labels shape: torch.Size([1, 18014]) Final batch size: 1, sequence length: 19597 Attention mask shape: torch.Size([1, 1, 19597, 19597]) Position ids shape: torch.Size([1, 19597]) Input IDs shape: torch.Size([1, 19597]) Labels shape: torch.Size([1, 19597]) Final batch size: 1, sequence length: 20912 Attention mask shape: torch.Size([1, 1, 20912, 20912]) Position ids shape: torch.Size([1, 20912]) Input IDs shape: torch.Size([1, 20912]) Labels shape: torch.Size([1, 20912]) Final batch size: 1, sequence length: 21617 Attention mask shape: torch.Size([1, 1, 21617, 21617]) Position ids shape: torch.Size([1, 21617]) Input IDs shape: torch.Size([1, 21617]) Labels shape: torch.Size([1, 21617]) Final batch size: 1, sequence length: 20118 Attention mask shape: torch.Size([1, 1, 20118, 20118]) Position ids shape: torch.Size([1, 20118]) Input IDs shape: torch.Size([1, 20118]) Labels shape: torch.Size([1, 20118]) Final batch size: 1, sequence length: 21611 Attention mask shape: torch.Size([1, 1, 21611, 21611]) Position ids shape: torch.Size([1, 21611]) Input IDs shape: torch.Size([1, 21611]) Labels shape: torch.Size([1, 21611]) Final batch size: 1, sequence length: 21122 Attention mask shape: torch.Size([1, 1, 21122, 21122]) Position ids shape: torch.Size([1, 21122]) Input IDs shape: torch.Size([1, 21122]) Labels shape: torch.Size([1, 21122]) Final batch size: 1, sequence length: 18393 Attention mask shape: torch.Size([1, 1, 18393, 18393]) Position ids shape: torch.Size([1, 18393]) Input IDs shape: torch.Size([1, 18393]) Labels shape: torch.Size([1, 18393]) Final batch size: 1, sequence length: 24597 Attention mask shape: torch.Size([1, 1, 24597, 24597]) Position ids shape: torch.Size([1, 24597]) Input IDs shape: torch.Size([1, 24597]) Labels shape: torch.Size([1, 24597]) Final batch size: 1, sequence length: 20888 Attention mask shape: torch.Size([1, 1, 20888, 20888]) Position ids shape: torch.Size([1, 20888]) Input IDs shape: torch.Size([1, 20888]) Labels shape: torch.Size([1, 20888]) Final batch size: 1, sequence length: 13638 Attention mask shape: torch.Size([1, 1, 13638, 13638]) Position ids shape: torch.Size([1, 13638]) Input IDs shape: torch.Size([1, 13638]) Labels shape: torch.Size([1, 13638]) Final batch size: 1, sequence length: 22854 Attention mask shape: torch.Size([1, 1, 22854, 22854]) Position ids shape: torch.Size([1, 22854]) Input IDs shape: torch.Size([1, 22854]) Labels shape: torch.Size([1, 22854]) Final batch size: 1, sequence length: 21314 Attention mask shape: torch.Size([1, 1, 21314, 21314]) Position ids shape: torch.Size([1, 21314]) Input IDs shape: torch.Size([1, 21314]) Labels shape: torch.Size([1, 21314]) Final batch size: 1, sequence length: 9341 Attention mask shape: torch.Size([1, 1, 9341, 9341]) Position ids shape: torch.Size([1, 9341]) Input IDs shape: torch.Size([1, 9341]) Labels shape: torch.Size([1, 9341]) Final batch size: 1, sequence length: 6839 Attention mask shape: torch.Size([1, 1, 6839, 6839]) Position ids shape: torch.Size([1, 6839]) Input IDs shape: torch.Size([1, 6839]) Labels shape: torch.Size([1, 6839]) Final batch size: 1, sequence length: 16831 Attention mask shape: torch.Size([1, 1, 16831, 16831]) Position ids shape: torch.Size([1, 16831]) Input IDs shape: torch.Size([1, 16831]) Labels shape: torch.Size([1, 16831]) Final batch size: 1, sequence length: 21739 Attention mask shape: torch.Size([1, 1, 21739, 21739]) Position ids shape: torch.Size([1, 21739]) Input IDs shape: torch.Size([1, 21739]) Labels shape: torch.Size([1, 21739]) Final batch size: 1, sequence length: 25909 Attention mask shape: torch.Size([1, 1, 25909, 25909]) Position ids shape: torch.Size([1, 25909]) Input IDs shape: torch.Size([1, 25909]) Labels shape: torch.Size([1, 25909]) Final batch size: 1, sequence length: 25575 Attention mask shape: torch.Size([1, 1, 25575, 25575]) Position ids shape: torch.Size([1, 25575]) Input IDs shape: torch.Size([1, 25575]) Labels shape: torch.Size([1, 25575]) Final batch size: 1, sequence length: 12328 Attention mask shape: torch.Size([1, 1, 12328, 12328]) Position ids shape: torch.Size([1, 12328]) Input IDs shape: torch.Size([1, 12328]) Labels shape: torch.Size([1, 12328]) Final batch size: 1, sequence length: 25042 Attention mask shape: torch.Size([1, 1, 25042, 25042]) Position ids shape: torch.Size([1, 25042]) Input IDs shape: torch.Size([1, 25042]) Labels shape: torch.Size([1, 25042]) Final batch size: 1, sequence length: 15232 Attention mask shape: torch.Size([1, 1, 15232, 15232]) Position ids shape: torch.Size([1, 15232]) Input IDs shape: torch.Size([1, 15232]) Labels shape: torch.Size([1, 15232]) Final batch size: 1, sequence length: 20619 Attention mask shape: torch.Size([1, 1, 20619, 20619]) Position ids shape: torch.Size([1, 20619]) Input IDs shape: torch.Size([1, 20619]) Labels shape: torch.Size([1, 20619]) Final batch size: 1, sequence length: 23614 Attention mask shape: torch.Size([1, 1, 23614, 23614]) Position ids shape: torch.Size([1, 23614]) Input IDs shape: torch.Size([1, 23614]) Labels shape: torch.Size([1, 23614]) Final batch size: 1, sequence length: 15026 Attention mask shape: torch.Size([1, 1, 15026, 15026]) Position ids shape: torch.Size([1, 15026]) Input IDs shape: torch.Size([1, 15026]) Labels shape: torch.Size([1, 15026]) Final batch size: 1, sequence length: 16611 Attention mask shape: torch.Size([1, 1, 16611, 16611]) Position ids shape: torch.Size([1, 16611]) Input IDs shape: torch.Size([1, 16611]) Labels shape: torch.Size([1, 16611]) Final batch size: 1, sequence length: 20702 Attention mask shape: torch.Size([1, 1, 20702, 20702]) Position ids shape: torch.Size([1, 20702]) Input IDs shape: torch.Size([1, 20702]) Labels shape: torch.Size([1, 20702]) Final batch size: 1, sequence length: 28348 Attention mask shape: torch.Size([1, 1, 28348, 28348]) Position ids shape: torch.Size([1, 28348]) Input IDs shape: torch.Size([1, 28348]) Labels shape: torch.Size([1, 28348]) Final batch size: 1, sequence length: 28678 Attention mask shape: torch.Size([1, 1, 28678, 28678]) Position ids shape: torch.Size([1, 28678]) Input IDs shape: torch.Size([1, 28678]) Labels shape: torch.Size([1, 28678]) Final batch size: 1, sequence length: 19885 Attention mask shape: torch.Size([1, 1, 19885, 19885]) Position ids shape: torch.Size([1, 19885]) Input IDs shape: torch.Size([1, 19885]) Labels shape: torch.Size([1, 19885]) Final batch size: 1, sequence length: 21725 Attention mask shape: torch.Size([1, 1, 21725, 21725]) Position ids shape: torch.Size([1, 21725]) Input IDs shape: torch.Size([1, 21725]) Labels shape: torch.Size([1, 21725]) Final batch size: 1, sequence length: 13646 Attention mask shape: torch.Size([1, 1, 13646, 13646]) Position ids shape: torch.Size([1, 13646]) Input IDs shape: torch.Size([1, 13646]) Labels shape: torch.Size([1, 13646]) Final batch size: 1, sequence length: 21677 Attention mask shape: torch.Size([1, 1, 21677, 21677]) Position ids shape: torch.Size([1, 21677]) Input IDs shape: torch.Size([1, 21677]) Labels shape: torch.Size([1, 21677]) Final batch size: 1, sequence length: 24090 Attention mask shape: torch.Size([1, 1, 24090, 24090]) Position ids shape: torch.Size([1, 24090]) Input IDs shape: torch.Size([1, 24090]) Labels shape: torch.Size([1, 24090]) Final batch size: 1, sequence length: 26218 Attention mask shape: torch.Size([1, 1, 26218, 26218]) Position ids shape: torch.Size([1, 26218]) Input IDs shape: torch.Size([1, 26218]) Labels shape: torch.Size([1, 26218]) Final batch size: 1, sequence length: 29730 Attention mask shape: torch.Size([1, 1, 29730, 29730]) Position ids shape: torch.Size([1, 29730]) Input IDs shape: torch.Size([1, 29730]) Labels shape: torch.Size([1, 29730]) Final batch size: 1, sequence length: 24342 Attention mask shape: torch.Size([1, 1, 24342, 24342]) Position ids shape: torch.Size([1, 24342]) Input IDs shape: torch.Size([1, 24342]) Labels shape: torch.Size([1, 24342]) Final batch size: 1, sequence length: 22217 Attention mask shape: torch.Size([1, 1, 22217, 22217]) Position ids shape: torch.Size([1, 22217]) Input IDs shape: torch.Size([1, 22217]) Labels shape: torch.Size([1, 22217]) Final batch size: 1, sequence length: 32034 Attention mask shape: torch.Size([1, 1, 32034, 32034]) Position ids shape: torch.Size([1, 32034]) Input IDs shape: torch.Size([1, 32034]) Labels shape: torch.Size([1, 32034]) Final batch size: 1, sequence length: 21623 Attention mask shape: torch.Size([1, 1, 21623, 21623]) Position ids shape: torch.Size([1, 21623]) Input IDs shape: torch.Size([1, 21623]) Labels shape: torch.Size([1, 21623]) Final batch size: 1, sequence length: 27795 Attention mask shape: torch.Size([1, 1, 27795, 27795]) Position ids shape: torch.Size([1, 27795]) Input IDs shape: torch.Size([1, 27795]) Labels shape: torch.Size([1, 27795]) Final batch size: 1, sequence length: 17780 Attention mask shape: torch.Size([1, 1, 17780, 17780]) Position ids shape: torch.Size([1, 17780]) Input IDs shape: torch.Size([1, 17780]) Labels shape: torch.Size([1, 17780]) Final batch size: 1, sequence length: 20915 Attention mask shape: torch.Size([1, 1, 20915, 20915]) Position ids shape: torch.Size([1, 20915]) Input IDs shape: torch.Size([1, 20915]) Labels shape: torch.Size([1, 20915]) Final batch size: 1, sequence length: 30766 Attention mask shape: torch.Size([1, 1, 30766, 30766]) Position ids shape: torch.Size([1, 30766]) Input IDs shape: torch.Size([1, 30766]) Labels shape: torch.Size([1, 30766]) Final batch size: 1, sequence length: 30410 Attention mask shape: torch.Size([1, 1, 30410, 30410]) Position ids shape: torch.Size([1, 30410]) Input IDs shape: torch.Size([1, 30410]) Labels shape: torch.Size([1, 30410]) Final batch size: 1, sequence length: 16403 Attention mask shape: torch.Size([1, 1, 16403, 16403]) Position ids shape: torch.Size([1, 16403]) Input IDs shape: torch.Size([1, 16403]) Labels shape: torch.Size([1, 16403]) Final batch size: 1, sequence length: 30917 Attention mask shape: torch.Size([1, 1, 30917, 30917]) Position ids shape: torch.Size([1, 30917]) Input IDs shape: torch.Size([1, 30917]) Labels shape: torch.Size([1, 30917]) Final batch size: 1, sequence length: 25698 Attention mask shape: torch.Size([1, 1, 25698, 25698]) Position ids shape: torch.Size([1, 25698]) Input IDs shape: torch.Size([1, 25698]) Labels shape: torch.Size([1, 25698]) Final batch size: 1, sequence length: 23626 Attention mask shape: torch.Size([1, 1, 23626, 23626]) Position ids shape: torch.Size([1, 23626]) Input IDs shape: torch.Size([1, 23626]) Labels shape: torch.Size([1, 23626]) Final batch size: 1, sequence length: 30366 Attention mask shape: torch.Size([1, 1, 30366, 30366]) Position ids shape: torch.Size([1, 30366]) Input IDs shape: torch.Size([1, 30366]) Labels shape: torch.Size([1, 30366]) Final batch size: 1, sequence length: 35781 Attention mask shape: torch.Size([1, 1, 35781, 35781]) Position ids shape: torch.Size([1, 35781]) Input IDs shape: torch.Size([1, 35781]) Labels shape: torch.Size([1, 35781]) Final batch size: 1, sequence length: 30796 Attention mask shape: torch.Size([1, 1, 30796, 30796]) Position ids shape: torch.Size([1, 30796]) Input IDs shape: torch.Size([1, 30796]) Labels shape: torch.Size([1, 30796]) Final batch size: 1, sequence length: 35697 Attention mask shape: torch.Size([1, 1, 35697, 35697]) Position ids shape: torch.Size([1, 35697]) Input IDs shape: torch.Size([1, 35697]) Labels shape: torch.Size([1, 35697]) Final batch size: 1, sequence length: 35628 Attention mask shape: torch.Size([1, 1, 35628, 35628]) Position ids shape: torch.Size([1, 35628]) Input IDs shape: torch.Size([1, 35628]) Labels shape: torch.Size([1, 35628]) Final batch size: 1, sequence length: 20028 Attention mask shape: torch.Size([1, 1, 20028, 20028]) Position ids shape: torch.Size([1, 20028]) Input IDs shape: torch.Size([1, 20028]) Labels shape: torch.Size([1, 20028]) Final batch size: 1, sequence length: 34802 Attention mask shape: torch.Size([1, 1, 34802, 34802]) Position ids shape: torch.Size([1, 34802]) Input IDs shape: torch.Size([1, 34802]) Labels shape: torch.Size([1, 34802]) Final batch size: 1, sequence length: 32308 Attention mask shape: torch.Size([1, 1, 32308, 32308]) Position ids shape: torch.Size([1, 32308]) Input IDs shape: torch.Size([1, 32308]) Labels shape: torch.Size([1, 32308]) Final batch size: 1, sequence length: 15726 Attention mask shape: torch.Size([1, 1, 15726, 15726]) Position ids shape: torch.Size([1, 15726]) Input IDs shape: torch.Size([1, 15726]) Labels shape: torch.Size([1, 15726]) Final batch size: 1, sequence length: 21562 Attention mask shape: torch.Size([1, 1, 21562, 21562]) Position ids shape: torch.Size([1, 21562]) Input IDs shape: torch.Size([1, 21562]) Labels shape: torch.Size([1, 21562]) Final batch size: 1, sequence length: 32239 Attention mask shape: torch.Size([1, 1, 32239, 32239]) Position ids shape: torch.Size([1, 32239]) Input IDs shape: torch.Size([1, 32239]) Labels shape: torch.Size([1, 32239]) Final batch size: 1, sequence length: 38190 Attention mask shape: torch.Size([1, 1, 38190, 38190]) Position ids shape: torch.Size([1, 38190]) Input IDs shape: torch.Size([1, 38190]) Labels shape: torch.Size([1, 38190]) Final batch size: 1, sequence length: 37195 Attention mask shape: torch.Size([1, 1, 37195, 37195]) Position ids shape: torch.Size([1, 37195]) Input IDs shape: torch.Size([1, 37195]) Labels shape: torch.Size([1, 37195]) Final batch size: 1, sequence length: 14070 Attention mask shape: torch.Size([1, 1, 14070, 14070]) Position ids shape: torch.Size([1, 14070]) Input IDs shape: torch.Size([1, 14070]) Labels shape: torch.Size([1, 14070]) Final batch size: 1, sequence length: 22768 Attention mask shape: torch.Size([1, 1, 22768, 22768]) Position ids shape: torch.Size([1, 22768]) Input IDs shape: torch.Size([1, 22768]) Labels shape: torch.Size([1, 22768]) Final batch size: 1, sequence length: 38420 Attention mask shape: torch.Size([1, 1, 38420, 38420]) Position ids shape: torch.Size([1, 38420]) Input IDs shape: torch.Size([1, 38420]) Labels shape: torch.Size([1, 38420]) Final batch size: 1, sequence length: 38362 Attention mask shape: torch.Size([1, 1, 38362, 38362]) Position ids shape: torch.Size([1, 38362]) Input IDs shape: torch.Size([1, 38362]) Labels shape: torch.Size([1, 38362]) Final batch size: 1, sequence length: 39608 Attention mask shape: torch.Size([1, 1, 39608, 39608]) Position ids shape: torch.Size([1, 39608]) Input IDs shape: torch.Size([1, 39608]) Labels shape: torch.Size([1, 39608]) Final batch size: 1, sequence length: 32630 Attention mask shape: torch.Size([1, 1, 32630, 32630]) Position ids shape: torch.Size([1, 32630]) Input IDs shape: torch.Size([1, 32630]) Labels shape: torch.Size([1, 32630]) Final batch size: 1, sequence length: 31143 Attention mask shape: torch.Size([1, 1, 31143, 31143]) Position ids shape: torch.Size([1, 31143]) Input IDs shape: torch.Size([1, 31143]) Labels shape: torch.Size([1, 31143]) Final batch size: 1, sequence length: 14828 Attention mask shape: torch.Size([1, 1, 14828, 14828]) Position ids shape: torch.Size([1, 14828]) Input IDs shape: torch.Size([1, 14828]) Labels shape: torch.Size([1, 14828]) Final batch size: 1, sequence length: 40088 Attention mask shape: torch.Size([1, 1, 40088, 40088]) Position ids shape: torch.Size([1, 40088]) Input IDs shape: torch.Size([1, 40088]) Labels shape: torch.Size([1, 40088]) Final batch size: 1, sequence length: 22199 Attention mask shape: torch.Size([1, 1, 22199, 22199]) Position ids shape: torch.Size([1, 22199]) Input IDs shape: torch.Size([1, 22199]) Labels shape: torch.Size([1, 22199]) Final batch size: 1, sequence length: 11896 Attention mask shape: torch.Size([1, 1, 11896, 11896]) Position ids shape: torch.Size([1, 11896]) Input IDs shape: torch.Size([1, 11896]) Labels shape: torch.Size([1, 11896]) Final batch size: 1, sequence length: 18568 Attention mask shape: torch.Size([1, 1, 18568, 18568]) Position ids shape: torch.Size([1, 18568]) Input IDs shape: torch.Size([1, 18568]) Labels shape: torch.Size([1, 18568]) Final batch size: 1, sequence length: 34785 Attention mask shape: torch.Size([1, 1, 34785, 34785]) Position ids shape: torch.Size([1, 34785]) Input IDs shape: torch.Size([1, 34785]) Labels shape: torch.Size([1, 34785]) Final batch size: 1, sequence length: 26840 Attention mask shape: torch.Size([1, 1, 26840, 26840]) Position ids shape: torch.Size([1, 26840]) Input IDs shape: torch.Size([1, 26840]) Labels shape: torch.Size([1, 26840]) Final batch size: 1, sequence length: 10426 Attention mask shape: torch.Size([1, 1, 10426, 10426]) Position ids shape: torch.Size([1, 10426]) Input IDs shape: torch.Size([1, 10426]) Labels shape: torch.Size([1, 10426]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 30651 Attention mask shape: torch.Size([1, 1, 30651, 30651]) Position ids shape: torch.Size([1, 30651]) Input IDs shape: torch.Size([1, 30651]) Labels shape: torch.Size([1, 30651]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 37529 Attention mask shape: torch.Size([1, 1, 37529, 37529]) Position ids shape: torch.Size([1, 37529]) Input IDs shape: torch.Size([1, 37529]) Labels shape: torch.Size([1, 37529]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 35914 Attention mask shape: torch.Size([1, 1, 35914, 35914]) Position ids shape: torch.Size([1, 35914]) Input IDs shape: torch.Size([1, 35914]) Labels shape: torch.Size([1, 35914]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 16304 Attention mask shape: torch.Size([1, 1, 16304, 16304]) Position ids shape: torch.Size([1, 16304]) Input IDs shape: torch.Size([1, 16304]) Labels shape: torch.Size([1, 16304]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 26287 Attention mask shape: torch.Size([1, 1, 26287, 26287]) Position ids shape: torch.Size([1, 26287]) Input IDs shape: torch.Size([1, 26287]) Labels shape: torch.Size([1, 26287]) Final batch size: 1, sequence length: 18859 Attention mask shape: torch.Size([1, 1, 18859, 18859]) Position ids shape: torch.Size([1, 18859]) Input IDs shape: torch.Size([1, 18859]) Labels shape: torch.Size([1, 18859]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 29202 Attention mask shape: torch.Size([1, 1, 29202, 29202]) Position ids shape: torch.Size([1, 29202]) Input IDs shape: torch.Size([1, 29202]) Labels shape: torch.Size([1, 29202]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 10198 Attention mask shape: torch.Size([1, 1, 10198, 10198]) Position ids shape: torch.Size([1, 10198]) Input IDs shape: torch.Size([1, 10198]) Labels shape: torch.Size([1, 10198]) Final batch size: 1, sequence length: 32559 Attention mask shape: torch.Size([1, 1, 32559, 32559]) Position ids shape: torch.Size([1, 32559]) Input IDs shape: torch.Size([1, 32559]) Labels shape: torch.Size([1, 32559]) Final batch size: 1, sequence length: 23936 Attention mask shape: torch.Size([1, 1, 23936, 23936]) Position ids shape: torch.Size([1, 23936]) Input IDs shape: torch.Size([1, 23936]) Labels shape: torch.Size([1, 23936]) Final batch size: 1, sequence length: 27819 Attention mask shape: torch.Size([1, 1, 27819, 27819]) Position ids shape: torch.Size([1, 27819]) Input IDs shape: torch.Size([1, 27819]) Labels shape: torch.Size([1, 27819]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 36638 Attention mask shape: torch.Size([1, 1, 36638, 36638]) Position ids shape: torch.Size([1, 36638]) Input IDs shape: torch.Size([1, 36638]) Labels shape: torch.Size([1, 36638]) Final batch size: 1, sequence length: 19494 Attention mask shape: torch.Size([1, 1, 19494, 19494]) Position ids shape: torch.Size([1, 19494]) Input IDs shape: torch.Size([1, 19494]) Labels shape: torch.Size([1, 19494]) Final batch size: 1, sequence length: 12605 Attention mask shape: torch.Size([1, 1, 12605, 12605]) Position ids shape: torch.Size([1, 12605]) Input IDs shape: torch.Size([1, 12605]) Labels shape: torch.Size([1, 12605]) Final batch size: 1, sequence length: 31316 Attention mask shape: torch.Size([1, 1, 31316, 31316]) Position ids shape: torch.Size([1, 31316]) Input IDs shape: torch.Size([1, 31316]) Labels shape: torch.Size([1, 31316]) Final batch size: 1, sequence length: 30835 Attention mask shape: torch.Size([1, 1, 30835, 30835]) Position ids shape: torch.Size([1, 30835]) Input IDs shape: torch.Size([1, 30835]) Labels shape: torch.Size([1, 30835]) Final batch size: 1, sequence length: 26269 Attention mask shape: torch.Size([1, 1, 26269, 26269]) Position ids shape: torch.Size([1, 26269]) Input IDs shape: torch.Size([1, 26269]) Labels shape: torch.Size([1, 26269]) Final batch size: 1, sequence length: 31754 Attention mask shape: torch.Size([1, 1, 31754, 31754]) Position ids shape: torch.Size([1, 31754]) Input IDs shape: torch.Size([1, 31754]) Labels shape: torch.Size([1, 31754]) Final batch size: 1, sequence length: 40763 Attention mask shape: torch.Size([1, 1, 40763, 40763]) Position ids shape: torch.Size([1, 40763]) Input IDs shape: torch.Size([1, 40763]) Labels shape: torch.Size([1, 40763]) Final batch size: 1, sequence length: 25573 Attention mask shape: torch.Size([1, 1, 25573, 25573]) Position ids shape: torch.Size([1, 25573]) Input IDs shape: torch.Size([1, 25573]) Labels shape: torch.Size([1, 25573]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 37170 Attention mask shape: torch.Size([1, 1, 37170, 37170]) Position ids shape: torch.Size([1, 37170]) Input IDs shape: torch.Size([1, 37170]) Labels shape: torch.Size([1, 37170]) {'loss': 0.2598, 'grad_norm': 0.1728384282783184, 'learning_rate': 3.320978675139919e-07, 'num_tokens': -inf, 'epoch': 7.25} Final batch size: 1, sequence length: 7998 Attention mask shape: torch.Size([1, 1, 7998, 7998]) Position ids shape: torch.Size([1, 7998]) Input IDs shape: torch.Size([1, 7998]) Labels shape: torch.Size([1, 7998]) Final batch size: 1, sequence length: 7402 Attention mask shape: torch.Size([1, 1, 7402, 7402]) Position ids shape: torch.Size([1, 7402]) Input IDs shape: torch.Size([1, 7402]) Labels shape: torch.Size([1, 7402]) Final batch size: 1, sequence length: 6925 Attention mask shape: torch.Size([1, 1, 6925, 6925]) Position ids shape: torch.Size([1, 6925]) Input IDs shape: torch.Size([1, 6925]) Labels shape: torch.Size([1, 6925]) Final batch size: 1, sequence length: 10102 Attention mask shape: torch.Size([1, 1, 10102, 10102]) Position ids shape: torch.Size([1, 10102]) Input IDs shape: torch.Size([1, 10102]) Labels shape: torch.Size([1, 10102]) Final batch size: 1, sequence length: 9452 Attention mask shape: torch.Size([1, 1, 9452, 9452]) Position ids shape: torch.Size([1, 9452]) Input IDs shape: torch.Size([1, 9452]) Labels shape: torch.Size([1, 9452]) Final batch size: 1, sequence length: 10862 Attention mask shape: torch.Size([1, 1, 10862, 10862]) Position ids shape: torch.Size([1, 10862]) Input IDs shape: torch.Size([1, 10862]) Labels shape: torch.Size([1, 10862]) Final batch size: 1, sequence length: 11947 Attention mask shape: torch.Size([1, 1, 11947, 11947]) Position ids shape: torch.Size([1, 11947]) Input IDs shape: torch.Size([1, 11947]) Labels shape: torch.Size([1, 11947]) Final batch size: 1, sequence length: 12719 Attention mask shape: torch.Size([1, 1, 12719, 12719]) Position ids shape: torch.Size([1, 12719]) Input IDs shape: torch.Size([1, 12719]) Labels shape: torch.Size([1, 12719]) Final batch size: 1, sequence length: 10804 Attention mask shape: torch.Size([1, 1, 10804, 10804]) Position ids shape: torch.Size([1, 10804]) Input IDs shape: torch.Size([1, 10804]) Labels shape: torch.Size([1, 10804]) Final batch size: 1, sequence length: 8432 Attention mask shape: torch.Size([1, 1, 8432, 8432]) Position ids shape: torch.Size([1, 8432]) Input IDs shape: torch.Size([1, 8432]) Labels shape: torch.Size([1, 8432]) Final batch size: 1, sequence length: 11623 Attention mask shape: torch.Size([1, 1, 11623, 11623]) Position ids shape: torch.Size([1, 11623]) Input IDs shape: torch.Size([1, 11623]) Labels shape: torch.Size([1, 11623]) Final batch size: 1, sequence length: 9243 Attention mask shape: torch.Size([1, 1, 9243, 9243]) Position ids shape: torch.Size([1, 9243]) Input IDs shape: torch.Size([1, 9243]) Labels shape: torch.Size([1, 9243]) Final batch size: 1, sequence length: 15794 Attention mask shape: torch.Size([1, 1, 15794, 15794]) Position ids shape: torch.Size([1, 15794]) Input IDs shape: torch.Size([1, 15794]) Labels shape: torch.Size([1, 15794]) Final batch size: 1, sequence length: 16060 Attention mask shape: torch.Size([1, 1, 16060, 16060]) Position ids shape: torch.Size([1, 16060]) Input IDs shape: torch.Size([1, 16060]) Labels shape: torch.Size([1, 16060]) Final batch size: 1, sequence length: 15053 Attention mask shape: torch.Size([1, 1, 15053, 15053]) Position ids shape: torch.Size([1, 15053]) Input IDs shape: torch.Size([1, 15053]) Labels shape: torch.Size([1, 15053]) Final batch size: 1, sequence length: 15886 Attention mask shape: torch.Size([1, 1, 15886, 15886]) Position ids shape: torch.Size([1, 15886]) Input IDs shape: torch.Size([1, 15886]) Labels shape: torch.Size([1, 15886]) Final batch size: 1, sequence length: 17376 Attention mask shape: torch.Size([1, 1, 17376, 17376]) Position ids shape: torch.Size([1, 17376]) Input IDs shape: torch.Size([1, 17376]) Labels shape: torch.Size([1, 17376]) Final batch size: 1, sequence length: 17918 Attention mask shape: torch.Size([1, 1, 17918, 17918]) Position ids shape: torch.Size([1, 17918]) Input IDs shape: torch.Size([1, 17918]) Labels shape: torch.Size([1, 17918]) Final batch size: 1, sequence length: 17370 Attention mask shape: torch.Size([1, 1, 17370, 17370]) Position ids shape: torch.Size([1, 17370]) Input IDs shape: torch.Size([1, 17370]) Labels shape: torch.Size([1, 17370]) Final batch size: 1, sequence length: 13209 Attention mask shape: torch.Size([1, 1, 13209, 13209]) Position ids shape: torch.Size([1, 13209]) Input IDs shape: torch.Size([1, 13209]) Labels shape: torch.Size([1, 13209]) Final batch size: 1, sequence length: 17767 Attention mask shape: torch.Size([1, 1, 17767, 17767]) Position ids shape: torch.Size([1, 17767]) Input IDs shape: torch.Size([1, 17767]) Labels shape: torch.Size([1, 17767]) Final batch size: 1, sequence length: 20185 Attention mask shape: torch.Size([1, 1, 20185, 20185]) Position ids shape: torch.Size([1, 20185]) Input IDs shape: torch.Size([1, 20185]) Labels shape: torch.Size([1, 20185]) Final batch size: 1, sequence length: 19609 Attention mask shape: torch.Size([1, 1, 19609, 19609]) Position ids shape: torch.Size([1, 19609]) Input IDs shape: torch.Size([1, 19609]) Labels shape: torch.Size([1, 19609]) Final batch size: 1, sequence length: 16536 Attention mask shape: torch.Size([1, 1, 16536, 16536]) Position ids shape: torch.Size([1, 16536]) Input IDs shape: torch.Size([1, 16536]) Labels shape: torch.Size([1, 16536]) Final batch size: 1, sequence length: 19629 Attention mask shape: torch.Size([1, 1, 19629, 19629]) Position ids shape: torch.Size([1, 19629]) Input IDs shape: torch.Size([1, 19629]) Labels shape: torch.Size([1, 19629]) Final batch size: 1, sequence length: 20554 Attention mask shape: torch.Size([1, 1, 20554, 20554]) Position ids shape: torch.Size([1, 20554]) Input IDs shape: torch.Size([1, 20554]) Labels shape: torch.Size([1, 20554]) Final batch size: 1, sequence length: 17665 Attention mask shape: torch.Size([1, 1, 17665, 17665]) Position ids shape: torch.Size([1, 17665]) Final batch size: 1, sequence length: 17625 Input IDs shape: torch.Size([1, 17665]) Attention mask shape: torch.Size([1, 1, 17625, 17625]) Position ids shape: torch.Size([1, 17625]) Input IDs shape: torch.Size([1, 17625]) Labels shape: torch.Size([1, 17625]) Labels shape: torch.Size([1, 17665]) Final batch size: 1, sequence length: 21117 Attention mask shape: torch.Size([1, 1, 21117, 21117]) Position ids shape: torch.Size([1, 21117]) Input IDs shape: torch.Size([1, 21117]) Labels shape: torch.Size([1, 21117]) Final batch size: 1, sequence length: 19935 Attention mask shape: torch.Size([1, 1, 19935, 19935]) Position ids shape: torch.Size([1, 19935]) Input IDs shape: torch.Size([1, 19935]) Labels shape: torch.Size([1, 19935]) Final batch size: 1, sequence length: 21421 Attention mask shape: torch.Size([1, 1, 21421, 21421]) Position ids shape: torch.Size([1, 21421]) Input IDs shape: torch.Size([1, 21421]) Labels shape: torch.Size([1, 21421]) Final batch size: 1, sequence length: 12006 Attention mask shape: torch.Size([1, 1, 12006, 12006]) Position ids shape: torch.Size([1, 12006]) Input IDs shape: torch.Size([1, 12006]) Labels shape: torch.Size([1, 12006]) Final batch size: 1, sequence length: 21771 Final batch size: 1, sequence length: 20056 Attention mask shape: torch.Size([1, 1, 21771, 21771]) Position ids shape: torch.Size([1, 21771]) Attention mask shape: torch.Size([1, 1, 20056, 20056]) Position ids shape: torch.Size([1, 20056]) Input IDs shape: torch.Size([1, 21771]) Labels shape: torch.Size([1, 21771]) Input IDs shape: torch.Size([1, 20056]) Labels shape: torch.Size([1, 20056]) Final batch size: 1, sequence length: 20524 Attention mask shape: torch.Size([1, 1, 20524, 20524]) Position ids shape: torch.Size([1, 20524]) Input IDs shape: torch.Size([1, 20524]) Labels shape: torch.Size([1, 20524]) Final batch size: 1, sequence length: 22797 Attention mask shape: torch.Size([1, 1, 22797, 22797]) Position ids shape: torch.Size([1, 22797]) Input IDs shape: torch.Size([1, 22797]) Labels shape: torch.Size([1, 22797]) Final batch size: 1, sequence length: 22208 Attention mask shape: torch.Size([1, 1, 22208, 22208]) Position ids shape: torch.Size([1, 22208]) Input IDs shape: torch.Size([1, 22208]) Labels shape: torch.Size([1, 22208]) Final batch size: 1, sequence length: 21980 Attention mask shape: torch.Size([1, 1, 21980, 21980]) Position ids shape: torch.Size([1, 21980]) Input IDs shape: torch.Size([1, 21980]) Labels shape: torch.Size([1, 21980]) Final batch size: 1, sequence length: 8498 Attention mask shape: torch.Size([1, 1, 8498, 8498]) Position ids shape: torch.Size([1, 8498]) Input IDs shape: torch.Size([1, 8498]) Labels shape: torch.Size([1, 8498]) Final batch size: 1, sequence length: 20816 Attention mask shape: torch.Size([1, 1, 20816, 20816]) Position ids shape: torch.Size([1, 20816]) Input IDs shape: torch.Size([1, 20816]) Labels shape: torch.Size([1, 20816]) Final batch size: 1, sequence length: 16439 Attention mask shape: torch.Size([1, 1, 16439, 16439]) Position ids shape: torch.Size([1, 16439]) Input IDs shape: torch.Size([1, 16439]) Labels shape: torch.Size([1, 16439]) Final batch size: 1, sequence length: 22683 Attention mask shape: torch.Size([1, 1, 22683, 22683]) Position ids shape: torch.Size([1, 22683]) Input IDs shape: torch.Size([1, 22683]) Labels shape: torch.Size([1, 22683]) Final batch size: 1, sequence length: 18991 Attention mask shape: torch.Size([1, 1, 18991, 18991]) Position ids shape: torch.Size([1, 18991]) Input IDs shape: torch.Size([1, 18991]) Labels shape: torch.Size([1, 18991]) Final batch size: 1, sequence length: 16141 Attention mask shape: torch.Size([1, 1, 16141, 16141]) Position ids shape: torch.Size([1, 16141]) Input IDs shape: torch.Size([1, 16141]) Labels shape: torch.Size([1, 16141]) Final batch size: 1, sequence length: 26121 Attention mask shape: torch.Size([1, 1, 26121, 26121]) Position ids shape: torch.Size([1, 26121]) Input IDs shape: torch.Size([1, 26121]) Labels shape: torch.Size([1, 26121]) Final batch size: 1, sequence length: 24308 Attention mask shape: torch.Size([1, 1, 24308, 24308]) Position ids shape: torch.Size([1, 24308]) Input IDs shape: torch.Size([1, 24308]) Labels shape: torch.Size([1, 24308]) Final batch size: 1, sequence length: 16564 Attention mask shape: torch.Size([1, 1, 16564, 16564]) Position ids shape: torch.Size([1, 16564]) Input IDs shape: torch.Size([1, 16564]) Labels shape: torch.Size([1, 16564]) Final batch size: 1, sequence length: 26142 Attention mask shape: torch.Size([1, 1, 26142, 26142]) Position ids shape: torch.Size([1, 26142]) Input IDs shape: torch.Size([1, 26142]) Labels shape: torch.Size([1, 26142]) Final batch size: 1, sequence length: 24515 Attention mask shape: torch.Size([1, 1, 24515, 24515]) Position ids shape: torch.Size([1, 24515]) Input IDs shape: torch.Size([1, 24515]) Labels shape: torch.Size([1, 24515]) Final batch size: 1, sequence length: 27179 Attention mask shape: torch.Size([1, 1, 27179, 27179]) Position ids shape: torch.Size([1, 27179]) Input IDs shape: torch.Size([1, 27179]) Labels shape: torch.Size([1, 27179]) Final batch size: 1, sequence length: 26449 Attention mask shape: torch.Size([1, 1, 26449, 26449]) Position ids shape: torch.Size([1, 26449]) Input IDs shape: torch.Size([1, 26449]) Labels shape: torch.Size([1, 26449]) Final batch size: 1, sequence length: 28164 Attention mask shape: torch.Size([1, 1, 28164, 28164]) Position ids shape: torch.Size([1, 28164]) Input IDs shape: torch.Size([1, 28164]) Labels shape: torch.Size([1, 28164]) Final batch size: 1, sequence length: 24433 Attention mask shape: torch.Size([1, 1, 24433, 24433]) Position ids shape: torch.Size([1, 24433]) Input IDs shape: torch.Size([1, 24433]) Labels shape: torch.Size([1, 24433]) Final batch size: 1, sequence length: 30197 Attention mask shape: torch.Size([1, 1, 30197, 30197]) Position ids shape: torch.Size([1, 30197]) Input IDs shape: torch.Size([1, 30197]) Labels shape: torch.Size([1, 30197]) Final batch size: 1, sequence length: 21152 Attention mask shape: torch.Size([1, 1, 21152, 21152]) Position ids shape: torch.Size([1, 21152]) Input IDs shape: torch.Size([1, 21152]) Labels shape: torch.Size([1, 21152]) Final batch size: 1, sequence length: 18122 Attention mask shape: torch.Size([1, 1, 18122, 18122]) Position ids shape: torch.Size([1, 18122]) Input IDs shape: torch.Size([1, 18122]) Labels shape: torch.Size([1, 18122]) Final batch size: 1, sequence length: 30181 Attention mask shape: torch.Size([1, 1, 30181, 30181]) Position ids shape: torch.Size([1, 30181]) Input IDs shape: torch.Size([1, 30181]) Labels shape: torch.Size([1, 30181]) Final batch size: 1, sequence length: 10857 Attention mask shape: torch.Size([1, 1, 10857, 10857]) Position ids shape: torch.Size([1, 10857]) Input IDs shape: torch.Size([1, 10857]) Labels shape: torch.Size([1, 10857]) Final batch size: 1, sequence length: 17363 Attention mask shape: torch.Size([1, 1, 17363, 17363]) Position ids shape: torch.Size([1, 17363]) Input IDs shape: torch.Size([1, 17363]) Labels shape: torch.Size([1, 17363]) Final batch size: 1, sequence length: 31339 Attention mask shape: torch.Size([1, 1, 31339, 31339]) Position ids shape: torch.Size([1, 31339]) Input IDs shape: torch.Size([1, 31339]) Labels shape: torch.Size([1, 31339]) Final batch size: 1, sequence length: 24802 Attention mask shape: torch.Size([1, 1, 24802, 24802]) Position ids shape: torch.Size([1, 24802]) Input IDs shape: torch.Size([1, 24802]) Labels shape: torch.Size([1, 24802]) Final batch size: 1, sequence length: 32466 Attention mask shape: torch.Size([1, 1, 32466, 32466]) Position ids shape: torch.Size([1, 32466]) Input IDs shape: torch.Size([1, 32466]) Labels shape: torch.Size([1, 32466]) Final batch size: 1, sequence length: 16837 Attention mask shape: torch.Size([1, 1, 16837, 16837]) Position ids shape: torch.Size([1, 16837]) Input IDs shape: torch.Size([1, 16837]) Labels shape: torch.Size([1, 16837]) Final batch size: 1, sequence length: 31507 Attention mask shape: torch.Size([1, 1, 31507, 31507]) Position ids shape: torch.Size([1, 31507]) Input IDs shape: torch.Size([1, 31507]) Labels shape: torch.Size([1, 31507]) Final batch size: 1, sequence length: 29628 Attention mask shape: torch.Size([1, 1, 29628, 29628]) Position ids shape: torch.Size([1, 29628]) Input IDs shape: torch.Size([1, 29628]) Labels shape: torch.Size([1, 29628]) Final batch size: 1, sequence length: 11107 Attention mask shape: torch.Size([1, 1, 11107, 11107]) Position ids shape: torch.Size([1, 11107]) Input IDs shape: torch.Size([1, 11107]) Labels shape: torch.Size([1, 11107]) Final batch size: 1, sequence length: 32752 Attention mask shape: torch.Size([1, 1, 32752, 32752]) Position ids shape: torch.Size([1, 32752]) Input IDs shape: torch.Size([1, 32752]) Labels shape: torch.Size([1, 32752]) Final batch size: 1, sequence length: 29392 Attention mask shape: torch.Size([1, 1, 29392, 29392]) Position ids shape: torch.Size([1, 29392]) Input IDs shape: torch.Size([1, 29392]) Labels shape: torch.Size([1, 29392]) Final batch size: 1, sequence length: 24527 Attention mask shape: torch.Size([1, 1, 24527, 24527]) Position ids shape: torch.Size([1, 24527]) Input IDs shape: torch.Size([1, 24527]) Labels shape: torch.Size([1, 24527]) Final batch size: 1, sequence length: 34711 Attention mask shape: torch.Size([1, 1, 34711, 34711]) Position ids shape: torch.Size([1, 34711]) Input IDs shape: torch.Size([1, 34711]) Labels shape: torch.Size([1, 34711]) Final batch size: 1, sequence length: 22762 Attention mask shape: torch.Size([1, 1, 22762, 22762]) Position ids shape: torch.Size([1, 22762]) Input IDs shape: torch.Size([1, 22762]) Labels shape: torch.Size([1, 22762]) Final batch size: 1, sequence length: 31525 Attention mask shape: torch.Size([1, 1, 31525, 31525]) Position ids shape: torch.Size([1, 31525]) Input IDs shape: torch.Size([1, 31525]) Labels shape: torch.Size([1, 31525]) Final batch size: 1, sequence length: 20274 Attention mask shape: torch.Size([1, 1, 20274, 20274]) Position ids shape: torch.Size([1, 20274]) Input IDs shape: torch.Size([1, 20274]) Labels shape: torch.Size([1, 20274]) Final batch size: 1, sequence length: 33894 Attention mask shape: torch.Size([1, 1, 33894, 33894]) Position ids shape: torch.Size([1, 33894]) Input IDs shape: torch.Size([1, 33894]) Labels shape: torch.Size([1, 33894]) Final batch size: 1, sequence length: 33087 Attention mask shape: torch.Size([1, 1, 33087, 33087]) Position ids shape: torch.Size([1, 33087]) Input IDs shape: torch.Size([1, 33087]) Labels shape: torch.Size([1, 33087]) Final batch size: 1, sequence length: 21143 Attention mask shape: torch.Size([1, 1, 21143, 21143]) Position ids shape: torch.Size([1, 21143]) Input IDs shape: torch.Size([1, 21143]) Labels shape: torch.Size([1, 21143]) Final batch size: 1, sequence length: 33367 Attention mask shape: torch.Size([1, 1, 33367, 33367]) Position ids shape: torch.Size([1, 33367]) Input IDs shape: torch.Size([1, 33367]) Labels shape: torch.Size([1, 33367]) Final batch size: 1, sequence length: 34512 Attention mask shape: torch.Size([1, 1, 34512, 34512]) Position ids shape: torch.Size([1, 34512]) Input IDs shape: torch.Size([1, 34512]) Labels shape: torch.Size([1, 34512]) Final batch size: 1, sequence length: 17181 Attention mask shape: torch.Size([1, 1, 17181, 17181]) Position ids shape: torch.Size([1, 17181]) Input IDs shape: torch.Size([1, 17181]) Labels shape: torch.Size([1, 17181]) Final batch size: 1, sequence length: 32891 Attention mask shape: torch.Size([1, 1, 32891, 32891]) Position ids shape: torch.Size([1, 32891]) Input IDs shape: torch.Size([1, 32891]) Labels shape: torch.Size([1, 32891]) Final batch size: 1, sequence length: 35523 Attention mask shape: torch.Size([1, 1, 35523, 35523]) Position ids shape: torch.Size([1, 35523]) Input IDs shape: torch.Size([1, 35523]) Labels shape: torch.Size([1, 35523]) Final batch size: 1, sequence length: 27061 Attention mask shape: torch.Size([1, 1, 27061, 27061]) Position ids shape: torch.Size([1, 27061]) Input IDs shape: torch.Size([1, 27061]) Labels shape: torch.Size([1, 27061]) Final batch size: 1, sequence length: 34405 Attention mask shape: torch.Size([1, 1, 34405, 34405]) Position ids shape: torch.Size([1, 34405]) Input IDs shape: torch.Size([1, 34405]) Labels shape: torch.Size([1, 34405]) Final batch size: 1, sequence length: 13942 Attention mask shape: torch.Size([1, 1, 13942, 13942]) Position ids shape: torch.Size([1, 13942]) Input IDs shape: torch.Size([1, 13942]) Labels shape: torch.Size([1, 13942]) Final batch size: 1, sequence length: 37555 Attention mask shape: torch.Size([1, 1, 37555, 37555]) Position ids shape: torch.Size([1, 37555]) Input IDs shape: torch.Size([1, 37555]) Labels shape: torch.Size([1, 37555]) Final batch size: 1, sequence length: 37343 Attention mask shape: torch.Size([1, 1, 37343, 37343]) Position ids shape: torch.Size([1, 37343]) Input IDs shape: torch.Size([1, 37343]) Labels shape: torch.Size([1, 37343]) Final batch size: 1, sequence length: 33794 Attention mask shape: torch.Size([1, 1, 33794, 33794]) Position ids shape: torch.Size([1, 33794]) Input IDs shape: torch.Size([1, 33794]) Labels shape: torch.Size([1, 33794]) Final batch size: 1, sequence length: 31518 Attention mask shape: torch.Size([1, 1, 31518, 31518]) Position ids shape: torch.Size([1, 31518]) Input IDs shape: torch.Size([1, 31518]) Labels shape: torch.Size([1, 31518]) Final batch size: 1, sequence length: 31875 Attention mask shape: torch.Size([1, 1, 31875, 31875]) Position ids shape: torch.Size([1, 31875]) Input IDs shape: torch.Size([1, 31875]) Labels shape: torch.Size([1, 31875]) Final batch size: 1, sequence length: 31561 Attention mask shape: torch.Size([1, 1, 31561, 31561]) Position ids shape: torch.Size([1, 31561]) Input IDs shape: torch.Size([1, 31561]) Labels shape: torch.Size([1, 31561]) Final batch size: 1, sequence length: 18229 Attention mask shape: torch.Size([1, 1, 18229, 18229]) Position ids shape: torch.Size([1, 18229]) Input IDs shape: torch.Size([1, 18229]) Labels shape: torch.Size([1, 18229]) Final batch size: 1, sequence length: 38010 Attention mask shape: torch.Size([1, 1, 38010, 38010]) Position ids shape: torch.Size([1, 38010]) Input IDs shape: torch.Size([1, 38010]) Labels shape: torch.Size([1, 38010]) Final batch size: 1, sequence length: 21859 Attention mask shape: torch.Size([1, 1, 21859, 21859]) Position ids shape: torch.Size([1, 21859]) Input IDs shape: torch.Size([1, 21859]) Labels shape: torch.Size([1, 21859]) Final batch size: 1, sequence length: 29216 Attention mask shape: torch.Size([1, 1, 29216, 29216]) Position ids shape: torch.Size([1, 29216]) Input IDs shape: torch.Size([1, 29216]) Labels shape: torch.Size([1, 29216]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 13978 Attention mask shape: torch.Size([1, 1, 13978, 13978]) Position ids shape: torch.Size([1, 13978]) Input IDs shape: torch.Size([1, 13978]) Labels shape: torch.Size([1, 13978]) Final batch size: 1, sequence length: 40910 Attention mask shape: torch.Size([1, 1, 40910, 40910]) Position ids shape: torch.Size([1, 40910]) Input IDs shape: torch.Size([1, 40910]) Labels shape: torch.Size([1, 40910]) Final batch size: 1, sequence length: 16716 Attention mask shape: torch.Size([1, 1, 16716, 16716]) Position ids shape: torch.Size([1, 16716]) Input IDs shape: torch.Size([1, 16716]) Labels shape: torch.Size([1, 16716]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 36450 Attention mask shape: torch.Size([1, 1, 36450, 36450]) Position ids shape: torch.Size([1, 36450]) Input IDs shape: torch.Size([1, 36450]) Labels shape: torch.Size([1, 36450]) Final batch size: 1, sequence length: 16433 Attention mask shape: torch.Size([1, 1, 16433, 16433]) Position ids shape: torch.Size([1, 16433]) Input IDs shape: torch.Size([1, 16433]) Labels shape: torch.Size([1, 16433]) Final batch size: 1, sequence length: 36464 Attention mask shape: torch.Size([1, 1, 36464, 36464]) Position ids shape: torch.Size([1, 36464]) Input IDs shape: torch.Size([1, 36464]) Labels shape: torch.Size([1, 36464]) Final batch size: 1, sequence length: 29042 Attention mask shape: torch.Size([1, 1, 29042, 29042]) Position ids shape: torch.Size([1, 29042]) Input IDs shape: torch.Size([1, 29042]) Labels shape: torch.Size([1, 29042]) Final batch size: 1, sequence length: 30134 Attention mask shape: torch.Size([1, 1, 30134, 30134]) Position ids shape: torch.Size([1, 30134]) Input IDs shape: torch.Size([1, 30134]) Labels shape: torch.Size([1, 30134]) Final batch size: 1, sequence length: 26922 Attention mask shape: torch.Size([1, 1, 26922, 26922]) Position ids shape: torch.Size([1, 26922]) Input IDs shape: torch.Size([1, 26922]) Labels shape: torch.Size([1, 26922]) Final batch size: 1, sequence length: 37349 Attention mask shape: torch.Size([1, 1, 37349, 37349]) Position ids shape: torch.Size([1, 37349]) Input IDs shape: torch.Size([1, 37349]) Labels shape: torch.Size([1, 37349]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 17758 Attention mask shape: torch.Size([1, 1, 17758, 17758]) Position ids shape: torch.Size([1, 17758]) Input IDs shape: torch.Size([1, 17758]) Labels shape: torch.Size([1, 17758]) Final batch size: 1, sequence length: 40752 Attention mask shape: torch.Size([1, 1, 40752, 40752]) Position ids shape: torch.Size([1, 40752]) Input IDs shape: torch.Size([1, 40752]) Labels shape: torch.Size([1, 40752]) Final batch size: 1, sequence length: 19287 Attention mask shape: torch.Size([1, 1, 19287, 19287]) Position ids shape: torch.Size([1, 19287]) Input IDs shape: torch.Size([1, 19287]) Labels shape: torch.Size([1, 19287]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 39169 Attention mask shape: torch.Size([1, 1, 39169, 39169]) Position ids shape: torch.Size([1, 39169]) Input IDs shape: torch.Size([1, 39169]) Labels shape: torch.Size([1, 39169]) Final batch size: 1, sequence length: 26893 Attention mask shape: torch.Size([1, 1, 26893, 26893]) Position ids shape: torch.Size([1, 26893]) Input IDs shape: torch.Size([1, 26893]) Labels shape: torch.Size([1, 26893]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 17535 Attention mask shape: torch.Size([1, 1, 17535, 17535]) Position ids shape: torch.Size([1, 17535]) Input IDs shape: torch.Size([1, 17535]) Labels shape: torch.Size([1, 17535]) Final batch size: 1, sequence length: 38529 Attention mask shape: torch.Size([1, 1, 38529, 38529]) Position ids shape: torch.Size([1, 38529]) Input IDs shape: torch.Size([1, 38529]) Labels shape: torch.Size([1, 38529]) Final batch size: 1, sequence length: 31449 Attention mask shape: torch.Size([1, 1, 31449, 31449]) Position ids shape: torch.Size([1, 31449]) Input IDs shape: torch.Size([1, 31449]) Labels shape: torch.Size([1, 31449]) Final batch size: 1, sequence length: 36599 Attention mask shape: torch.Size([1, 1, 36599, 36599]) Position ids shape: torch.Size([1, 36599]) Input IDs shape: torch.Size([1, 36599]) Labels shape: torch.Size([1, 36599]) Final batch size: 1, sequence length: 39258 Attention mask shape: torch.Size([1, 1, 39258, 39258]) Position ids shape: torch.Size([1, 39258]) Input IDs shape: torch.Size([1, 39258]) Labels shape: torch.Size([1, 39258]) Final batch size: 1, sequence length: 32638 Attention mask shape: torch.Size([1, 1, 32638, 32638]) Position ids shape: torch.Size([1, 32638]) Input IDs shape: torch.Size([1, 32638]) Labels shape: torch.Size([1, 32638]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 17890 Attention mask shape: torch.Size([1, 1, 17890, 17890]) Position ids shape: torch.Size([1, 17890]) Input IDs shape: torch.Size([1, 17890]) Labels shape: torch.Size([1, 17890]) Final batch size: 1, sequence length: 38891 Attention mask shape: torch.Size([1, 1, 38891, 38891]) Position ids shape: torch.Size([1, 38891]) Input IDs shape: torch.Size([1, 38891]) Labels shape: torch.Size([1, 38891]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 18471 Attention mask shape: torch.Size([1, 1, 18471, 18471]) Position ids shape: torch.Size([1, 18471]) Input IDs shape: torch.Size([1, 18471]) Labels shape: torch.Size([1, 18471]) Final batch size: 1, sequence length: 39953 Attention mask shape: torch.Size([1, 1, 39953, 39953]) Position ids shape: torch.Size([1, 39953]) Input IDs shape: torch.Size([1, 39953]) Labels shape: torch.Size([1, 39953]) Final batch size: 1, sequence length: 32465 Attention mask shape: torch.Size([1, 1, 32465, 32465]) Position ids shape: torch.Size([1, 32465]) Input IDs shape: torch.Size([1, 32465]) Labels shape: torch.Size([1, 32465]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) {'loss': 0.2487, 'grad_norm': 0.16983181329533026, 'learning_rate': 2.447174185242324e-07, 'num_tokens': -inf, 'epoch': 7.38} Final batch size: 1, sequence length: 5818 Attention mask shape: torch.Size([1, 1, 5818, 5818]) Position ids shape: torch.Size([1, 5818]) Input IDs shape: torch.Size([1, 5818]) Labels shape: torch.Size([1, 5818]) Final batch size: 1, sequence length: 6215 Attention mask shape: torch.Size([1, 1, 6215, 6215]) Position ids shape: torch.Size([1, 6215]) Input IDs shape: torch.Size([1, 6215]) Labels shape: torch.Size([1, 6215]) Final batch size: 1, sequence length: 7977 Attention mask shape: torch.Size([1, 1, 7977, 7977]) Position ids shape: torch.Size([1, 7977]) Input IDs shape: torch.Size([1, 7977]) Labels shape: torch.Size([1, 7977]) Final batch size: 1, sequence length: 8500 Attention mask shape: torch.Size([1, 1, 8500, 8500]) Position ids shape: torch.Size([1, 8500]) Input IDs shape: torch.Size([1, 8500]) Labels shape: torch.Size([1, 8500]) Final batch size: 1, sequence length: 6095 Attention mask shape: torch.Size([1, 1, 6095, 6095]) Position ids shape: torch.Size([1, 6095]) Input IDs shape: torch.Size([1, 6095]) Labels shape: torch.Size([1, 6095]) Final batch size: 1, sequence length: 10080 Attention mask shape: torch.Size([1, 1, 10080, 10080]) Position ids shape: torch.Size([1, 10080]) Input IDs shape: torch.Size([1, 10080]) Labels shape: torch.Size([1, 10080]) Final batch size: 1, sequence length: 12454 Attention mask shape: torch.Size([1, 1, 12454, 12454]) Position ids shape: torch.Size([1, 12454]) Input IDs shape: torch.Size([1, 12454]) Labels shape: torch.Size([1, 12454]) Final batch size: 1, sequence length: 12928 Attention mask shape: torch.Size([1, 1, 12928, 12928]) Position ids shape: torch.Size([1, 12928]) Input IDs shape: torch.Size([1, 12928]) Labels shape: torch.Size([1, 12928]) Final batch size: 1, sequence length: 10107 Attention mask shape: torch.Size([1, 1, 10107, 10107]) Position ids shape: torch.Size([1, 10107]) Input IDs shape: torch.Size([1, 10107]) Labels shape: torch.Size([1, 10107]) Final batch size: 1, sequence length: 12826 Attention mask shape: torch.Size([1, 1, 12826, 12826]) Position ids shape: torch.Size([1, 12826]) Input IDs shape: torch.Size([1, 12826]) Labels shape: torch.Size([1, 12826]) Final batch size: 1, sequence length: 10505 Attention mask shape: torch.Size([1, 1, 10505, 10505]) Position ids shape: torch.Size([1, 10505]) Input IDs shape: torch.Size([1, 10505]) Labels shape: torch.Size([1, 10505]) Final batch size: 1, sequence length: 9217 Attention mask shape: torch.Size([1, 1, 9217, 9217]) Position ids shape: torch.Size([1, 9217]) Input IDs shape: torch.Size([1, 9217]) Labels shape: torch.Size([1, 9217]) Final batch size: 1, sequence length: 13624 Attention mask shape: torch.Size([1, 1, 13624, 13624]) Position ids shape: torch.Size([1, 13624]) Input IDs shape: torch.Size([1, 13624]) Labels shape: torch.Size([1, 13624]) Final batch size: 1, sequence length: 13459 Attention mask shape: torch.Size([1, 1, 13459, 13459]) Position ids shape: torch.Size([1, 13459]) Input IDs shape: torch.Size([1, 13459]) Labels shape: torch.Size([1, 13459]) Final batch size: 1, sequence length: 9379 Attention mask shape: torch.Size([1, 1, 9379, 9379]) Position ids shape: torch.Size([1, 9379]) Input IDs shape: torch.Size([1, 9379]) Labels shape: torch.Size([1, 9379]) Final batch size: 1, sequence length: 13092 Attention mask shape: torch.Size([1, 1, 13092, 13092]) Position ids shape: torch.Size([1, 13092]) Input IDs shape: torch.Size([1, 13092]) Labels shape: torch.Size([1, 13092]) Final batch size: 1, sequence length: 16782 Attention mask shape: torch.Size([1, 1, 16782, 16782]) Position ids shape: torch.Size([1, 16782]) Input IDs shape: torch.Size([1, 16782]) Labels shape: torch.Size([1, 16782]) Final batch size: 1, sequence length: 12556 Attention mask shape: torch.Size([1, 1, 12556, 12556]) Position ids shape: torch.Size([1, 12556]) Input IDs shape: torch.Size([1, 12556]) Labels shape: torch.Size([1, 12556]) Final batch size: 1, sequence length: 10687 Attention mask shape: torch.Size([1, 1, 10687, 10687]) Position ids shape: torch.Size([1, 10687]) Input IDs shape: torch.Size([1, 10687]) Labels shape: torch.Size([1, 10687]) Final batch size: 1, sequence length: 10390 Attention mask shape: torch.Size([1, 1, 10390, 10390]) Position ids shape: torch.Size([1, 10390]) Input IDs shape: torch.Size([1, 10390]) Labels shape: torch.Size([1, 10390]) Final batch size: 1, sequence length: 17980 Attention mask shape: torch.Size([1, 1, 17980, 17980]) Position ids shape: torch.Size([1, 17980]) Input IDs shape: torch.Size([1, 17980]) Labels shape: torch.Size([1, 17980]) Final batch size: 1, sequence length: 10277 Attention mask shape: torch.Size([1, 1, 10277, 10277]) Position ids shape: torch.Size([1, 10277]) Input IDs shape: torch.Size([1, 10277]) Labels shape: torch.Size([1, 10277]) Final batch size: 1, sequence length: 18469 Attention mask shape: torch.Size([1, 1, 18469, 18469]) Position ids shape: torch.Size([1, 18469]) Input IDs shape: torch.Size([1, 18469]) Labels shape: torch.Size([1, 18469]) Final batch size: 1, sequence length: 10469 Attention mask shape: torch.Size([1, 1, 10469, 10469]) Position ids shape: torch.Size([1, 10469]) Input IDs shape: torch.Size([1, 10469]) Labels shape: torch.Size([1, 10469]) Final batch size: 1, sequence length: 18454 Attention mask shape: torch.Size([1, 1, 18454, 18454]) Position ids shape: torch.Size([1, 18454]) Input IDs shape: torch.Size([1, 18454]) Labels shape: torch.Size([1, 18454]) Final batch size: 1, sequence length: 19671 Attention mask shape: torch.Size([1, 1, 19671, 19671]) Position ids shape: torch.Size([1, 19671]) Input IDs shape: torch.Size([1, 19671]) Labels shape: torch.Size([1, 19671]) Final batch size: 1, sequence length: 15185 Attention mask shape: torch.Size([1, 1, 15185, 15185]) Position ids shape: torch.Size([1, 15185]) Input IDs shape: torch.Size([1, 15185]) Labels shape: torch.Size([1, 15185]) Final batch size: 1, sequence length: 19683 Attention mask shape: torch.Size([1, 1, 19683, 19683]) Position ids shape: torch.Size([1, 19683]) Input IDs shape: torch.Size([1, 19683]) Labels shape: torch.Size([1, 19683]) Final batch size: 1, sequence length: 20243 Attention mask shape: torch.Size([1, 1, 20243, 20243]) Position ids shape: torch.Size([1, 20243]) Input IDs shape: torch.Size([1, 20243]) Labels shape: torch.Size([1, 20243]) Final batch size: 1, sequence length: 20191 Attention mask shape: torch.Size([1, 1, 20191, 20191]) Position ids shape: torch.Size([1, 20191]) Input IDs shape: torch.Size([1, 20191]) Labels shape: torch.Size([1, 20191]) Final batch size: 1, sequence length: 15263 Attention mask shape: torch.Size([1, 1, 15263, 15263]) Position ids shape: torch.Size([1, 15263]) Input IDs shape: torch.Size([1, 15263]) Labels shape: torch.Size([1, 15263]) Final batch size: 1, sequence length: 21051 Attention mask shape: torch.Size([1, 1, 21051, 21051]) Position ids shape: torch.Size([1, 21051]) Input IDs shape: torch.Size([1, 21051]) Labels shape: torch.Size([1, 21051]) Final batch size: 1, sequence length: 14892 Attention mask shape: torch.Size([1, 1, 14892, 14892]) Position ids shape: torch.Size([1, 14892]) Input IDs shape: torch.Size([1, 14892]) Labels shape: torch.Size([1, 14892]) Final batch size: 1, sequence length: 16585 Attention mask shape: torch.Size([1, 1, 16585, 16585]) Position ids shape: torch.Size([1, 16585]) Input IDs shape: torch.Size([1, 16585]) Labels shape: torch.Size([1, 16585]) Final batch size: 1, sequence length: 17417 Attention mask shape: torch.Size([1, 1, 17417, 17417]) Position ids shape: torch.Size([1, 17417]) Input IDs shape: torch.Size([1, 17417]) Labels shape: torch.Size([1, 17417]) Final batch size: 1, sequence length: 10182 Attention mask shape: torch.Size([1, 1, 10182, 10182]) Position ids shape: torch.Size([1, 10182]) Input IDs shape: torch.Size([1, 10182]) Labels shape: torch.Size([1, 10182]) Final batch size: 1, sequence length: 12012 Attention mask shape: torch.Size([1, 1, 12012, 12012]) Position ids shape: torch.Size([1, 12012]) Input IDs shape: torch.Size([1, 12012]) Labels shape: torch.Size([1, 12012]) Final batch size: 1, sequence length: 22133 Attention mask shape: torch.Size([1, 1, 22133, 22133]) Position ids shape: torch.Size([1, 22133]) Input IDs shape: torch.Size([1, 22133]) Labels shape: torch.Size([1, 22133]) Final batch size: 1, sequence length: 20562 Attention mask shape: torch.Size([1, 1, 20562, 20562]) Position ids shape: torch.Size([1, 20562]) Input IDs shape: torch.Size([1, 20562]) Labels shape: torch.Size([1, 20562]) Final batch size: 1, sequence length: 20432 Attention mask shape: torch.Size([1, 1, 20432, 20432]) Position ids shape: torch.Size([1, 20432]) Input IDs shape: torch.Size([1, 20432]) Labels shape: torch.Size([1, 20432]) Final batch size: 1, sequence length: 24255 Attention mask shape: torch.Size([1, 1, 24255, 24255]) Position ids shape: torch.Size([1, 24255]) Input IDs shape: torch.Size([1, 24255]) Labels shape: torch.Size([1, 24255]) Final batch size: 1, sequence length: 14556 Attention mask shape: torch.Size([1, 1, 14556, 14556]) Position ids shape: torch.Size([1, 14556]) Input IDs shape: torch.Size([1, 14556]) Labels shape: torch.Size([1, 14556]) Final batch size: 1, sequence length: 21664 Attention mask shape: torch.Size([1, 1, 21664, 21664]) Position ids shape: torch.Size([1, 21664]) Input IDs shape: torch.Size([1, 21664]) Labels shape: torch.Size([1, 21664]) Final batch size: 1, sequence length: 23238 Attention mask shape: torch.Size([1, 1, 23238, 23238]) Position ids shape: torch.Size([1, 23238]) Input IDs shape: torch.Size([1, 23238]) Labels shape: torch.Size([1, 23238]) Final batch size: 1, sequence length: 15339 Attention mask shape: torch.Size([1, 1, 15339, 15339]) Position ids shape: torch.Size([1, 15339]) Input IDs shape: torch.Size([1, 15339]) Labels shape: torch.Size([1, 15339]) Final batch size: 1, sequence length: 24002 Attention mask shape: torch.Size([1, 1, 24002, 24002]) Position ids shape: torch.Size([1, 24002]) Input IDs shape: torch.Size([1, 24002]) Labels shape: torch.Size([1, 24002]) Final batch size: 1, sequence length: 24769 Attention mask shape: torch.Size([1, 1, 24769, 24769]) Position ids shape: torch.Size([1, 24769]) Input IDs shape: torch.Size([1, 24769]) Labels shape: torch.Size([1, 24769]) Final batch size: 1, sequence length: 19187 Attention mask shape: torch.Size([1, 1, 19187, 19187]) Position ids shape: torch.Size([1, 19187]) Input IDs shape: torch.Size([1, 19187]) Labels shape: torch.Size([1, 19187]) Final batch size: 1, sequence length: 26144 Attention mask shape: torch.Size([1, 1, 26144, 26144]) Position ids shape: torch.Size([1, 26144]) Input IDs shape: torch.Size([1, 26144]) Labels shape: torch.Size([1, 26144]) Final batch size: 1, sequence length: 24428 Attention mask shape: torch.Size([1, 1, 24428, 24428]) Position ids shape: torch.Size([1, 24428]) Input IDs shape: torch.Size([1, 24428]) Labels shape: torch.Size([1, 24428]) Final batch size: 1, sequence length: 25707 Attention mask shape: torch.Size([1, 1, 25707, 25707]) Position ids shape: torch.Size([1, 25707]) Input IDs shape: torch.Size([1, 25707]) Labels shape: torch.Size([1, 25707]) Final batch size: 1, sequence length: 22857 Attention mask shape: torch.Size([1, 1, 22857, 22857]) Position ids shape: torch.Size([1, 22857]) Input IDs shape: torch.Size([1, 22857]) Labels shape: torch.Size([1, 22857]) Final batch size: 1, sequence length: 25656 Attention mask shape: torch.Size([1, 1, 25656, 25656]) Position ids shape: torch.Size([1, 25656]) Input IDs shape: torch.Size([1, 25656]) Labels shape: torch.Size([1, 25656]) Final batch size: 1, sequence length: 24858 Attention mask shape: torch.Size([1, 1, 24858, 24858]) Position ids shape: torch.Size([1, 24858]) Input IDs shape: torch.Size([1, 24858]) Labels shape: torch.Size([1, 24858]) Final batch size: 1, sequence length: 16541 Attention mask shape: torch.Size([1, 1, 16541, 16541]) Position ids shape: torch.Size([1, 16541]) Input IDs shape: torch.Size([1, 16541]) Labels shape: torch.Size([1, 16541]) Final batch size: 1, sequence length: 26312 Attention mask shape: torch.Size([1, 1, 26312, 26312]) Position ids shape: torch.Size([1, 26312]) Input IDs shape: torch.Size([1, 26312]) Labels shape: torch.Size([1, 26312]) Final batch size: 1, sequence length: 25758 Attention mask shape: torch.Size([1, 1, 25758, 25758]) Position ids shape: torch.Size([1, 25758]) Input IDs shape: torch.Size([1, 25758]) Labels shape: torch.Size([1, 25758]) Final batch size: 1, sequence length: 23602 Attention mask shape: torch.Size([1, 1, 23602, 23602]) Position ids shape: torch.Size([1, 23602]) Input IDs shape: torch.Size([1, 23602]) Labels shape: torch.Size([1, 23602]) Final batch size: 1, sequence length: 26033 Attention mask shape: torch.Size([1, 1, 26033, 26033]) Position ids shape: torch.Size([1, 26033]) Input IDs shape: torch.Size([1, 26033]) Labels shape: torch.Size([1, 26033]) Final batch size: 1, sequence length: 25886 Attention mask shape: torch.Size([1, 1, 25886, 25886]) Position ids shape: torch.Size([1, 25886]) Input IDs shape: torch.Size([1, 25886]) Labels shape: torch.Size([1, 25886]) Final batch size: 1, sequence length: 10318 Attention mask shape: torch.Size([1, 1, 10318, 10318]) Position ids shape: torch.Size([1, 10318]) Input IDs shape: torch.Size([1, 10318]) Labels shape: torch.Size([1, 10318]) Final batch size: 1, sequence length: 23698 Attention mask shape: torch.Size([1, 1, 23698, 23698]) Position ids shape: torch.Size([1, 23698]) Input IDs shape: torch.Size([1, 23698]) Labels shape: torch.Size([1, 23698]) Final batch size: 1, sequence length: 29948 Attention mask shape: torch.Size([1, 1, 29948, 29948]) Position ids shape: torch.Size([1, 29948]) Input IDs shape: torch.Size([1, 29948]) Labels shape: torch.Size([1, 29948]) Final batch size: 1, sequence length: 30079 Attention mask shape: torch.Size([1, 1, 30079, 30079]) Position ids shape: torch.Size([1, 30079]) Input IDs shape: torch.Size([1, 30079]) Labels shape: torch.Size([1, 30079]) Final batch size: 1, sequence length: 15294 Attention mask shape: torch.Size([1, 1, 15294, 15294]) Position ids shape: torch.Size([1, 15294]) Input IDs shape: torch.Size([1, 15294]) Labels shape: torch.Size([1, 15294]) Final batch size: 1, sequence length: 29825 Attention mask shape: torch.Size([1, 1, 29825, 29825]) Position ids shape: torch.Size([1, 29825]) Input IDs shape: torch.Size([1, 29825]) Labels shape: torch.Size([1, 29825]) Final batch size: 1, sequence length: 32541 Attention mask shape: torch.Size([1, 1, 32541, 32541]) Position ids shape: torch.Size([1, 32541]) Input IDs shape: torch.Size([1, 32541]) Labels shape: torch.Size([1, 32541]) Final batch size: 1, sequence length: 29749 Attention mask shape: torch.Size([1, 1, 29749, 29749]) Position ids shape: torch.Size([1, 29749]) Input IDs shape: torch.Size([1, 29749]) Labels shape: torch.Size([1, 29749]) Final batch size: 1, sequence length: 21374 Attention mask shape: torch.Size([1, 1, 21374, 21374]) Position ids shape: torch.Size([1, 21374]) Input IDs shape: torch.Size([1, 21374]) Labels shape: torch.Size([1, 21374]) Final batch size: 1, sequence length: 15604 Attention mask shape: torch.Size([1, 1, 15604, 15604]) Position ids shape: torch.Size([1, 15604]) Input IDs shape: torch.Size([1, 15604]) Labels shape: torch.Size([1, 15604]) Final batch size: 1, sequence length: 25719 Attention mask shape: torch.Size([1, 1, 25719, 25719]) Position ids shape: torch.Size([1, 25719]) Input IDs shape: torch.Size([1, 25719]) Labels shape: torch.Size([1, 25719]) Final batch size: 1, sequence length: 32071 Attention mask shape: torch.Size([1, 1, 32071, 32071]) Position ids shape: torch.Size([1, 32071]) Input IDs shape: torch.Size([1, 32071]) Labels shape: torch.Size([1, 32071]) Final batch size: 1, sequence length: 22139 Attention mask shape: torch.Size([1, 1, 22139, 22139]) Position ids shape: torch.Size([1, 22139]) Input IDs shape: torch.Size([1, 22139]) Labels shape: torch.Size([1, 22139]) Final batch size: 1, sequence length: 35760 Attention mask shape: torch.Size([1, 1, 35760, 35760]) Position ids shape: torch.Size([1, 35760]) Input IDs shape: torch.Size([1, 35760]) Labels shape: torch.Size([1, 35760]) Final batch size: 1, sequence length: 33611 Attention mask shape: torch.Size([1, 1, 33611, 33611]) Position ids shape: torch.Size([1, 33611]) Input IDs shape: torch.Size([1, 33611]) Labels shape: torch.Size([1, 33611]) Final batch size: 1, sequence length: 25492 Attention mask shape: torch.Size([1, 1, 25492, 25492]) Position ids shape: torch.Size([1, 25492]) Input IDs shape: torch.Size([1, 25492]) Labels shape: torch.Size([1, 25492]) Final batch size: 1, sequence length: 23399 Attention mask shape: torch.Size([1, 1, 23399, 23399]) Position ids shape: torch.Size([1, 23399]) Input IDs shape: torch.Size([1, 23399]) Labels shape: torch.Size([1, 23399]) Final batch size: 1, sequence length: 36456 Attention mask shape: torch.Size([1, 1, 36456, 36456]) Position ids shape: torch.Size([1, 36456]) Input IDs shape: torch.Size([1, 36456]) Labels shape: torch.Size([1, 36456]) Final batch size: 1, sequence length: 9897 Attention mask shape: torch.Size([1, 1, 9897, 9897]) Position ids shape: torch.Size([1, 9897]) Input IDs shape: torch.Size([1, 9897]) Labels shape: torch.Size([1, 9897]) Final batch size: 1, sequence length: 34186 Attention mask shape: torch.Size([1, 1, 34186, 34186]) Position ids shape: torch.Size([1, 34186]) Input IDs shape: torch.Size([1, 34186]) Labels shape: torch.Size([1, 34186]) Final batch size: 1, sequence length: 37241 Attention mask shape: torch.Size([1, 1, 37241, 37241]) Position ids shape: torch.Size([1, 37241]) Input IDs shape: torch.Size([1, 37241]) Labels shape: torch.Size([1, 37241]) Final batch size: 1, sequence length: 19238 Attention mask shape: torch.Size([1, 1, 19238, 19238]) Position ids shape: torch.Size([1, 19238]) Input IDs shape: torch.Size([1, 19238]) Labels shape: torch.Size([1, 19238]) Final batch size: 1, sequence length: 34701 Attention mask shape: torch.Size([1, 1, 34701, 34701]) Position ids shape: torch.Size([1, 34701]) Input IDs shape: torch.Size([1, 34701]) Labels shape: torch.Size([1, 34701]) Final batch size: 1, sequence length: 37728 Attention mask shape: torch.Size([1, 1, 37728, 37728]) Position ids shape: torch.Size([1, 37728]) Input IDs shape: torch.Size([1, 37728]) Labels shape: torch.Size([1, 37728]) Final batch size: 1, sequence length: 36778 Attention mask shape: torch.Size([1, 1, 36778, 36778]) Position ids shape: torch.Size([1, 36778]) Input IDs shape: torch.Size([1, 36778]) Labels shape: torch.Size([1, 36778]) Final batch size: 1, sequence length: 21557 Attention mask shape: torch.Size([1, 1, 21557, 21557]) Position ids shape: torch.Size([1, 21557]) Input IDs shape: torch.Size([1, 21557]) Labels shape: torch.Size([1, 21557]) Final batch size: 1, sequence length: 33459 Attention mask shape: torch.Size([1, 1, 33459, 33459]) Position ids shape: torch.Size([1, 33459]) Input IDs shape: torch.Size([1, 33459]) Labels shape: torch.Size([1, 33459]) Final batch size: 1, sequence length: 40147 Attention mask shape: torch.Size([1, 1, 40147, 40147]) Position ids shape: torch.Size([1, 40147]) Input IDs shape: torch.Size([1, 40147]) Labels shape: torch.Size([1, 40147]) Final batch size: 1, sequence length: 40593 Attention mask shape: torch.Size([1, 1, 40593, 40593]) Position ids shape: torch.Size([1, 40593]) Input IDs shape: torch.Size([1, 40593]) Labels shape: torch.Size([1, 40593]) Final batch size: 1, sequence length: 7681 Attention mask shape: torch.Size([1, 1, 7681, 7681]) Position ids shape: torch.Size([1, 7681]) Input IDs shape: torch.Size([1, 7681]) Labels shape: torch.Size([1, 7681]) Final batch size: 1, sequence length: 23343 Attention mask shape: torch.Size([1, 1, 23343, 23343]) Position ids shape: torch.Size([1, 23343]) Input IDs shape: torch.Size([1, 23343]) Labels shape: torch.Size([1, 23343]) Final batch size: 1, sequence length: 31650 Attention mask shape: torch.Size([1, 1, 31650, 31650]) Position ids shape: torch.Size([1, 31650]) Input IDs shape: torch.Size([1, 31650]) Labels shape: torch.Size([1, 31650]) Final batch size: 1, sequence length: 9445 Attention mask shape: torch.Size([1, 1, 9445, 9445]) Position ids shape: torch.Size([1, 9445]) Input IDs shape: torch.Size([1, 9445]) Labels shape: torch.Size([1, 9445]) Final batch size: 1, sequence length: 31323 Attention mask shape: torch.Size([1, 1, 31323, 31323]) Position ids shape: torch.Size([1, 31323]) Input IDs shape: torch.Size([1, 31323]) Labels shape: torch.Size([1, 31323]) Final batch size: 1, sequence length: 21955 Attention mask shape: torch.Size([1, 1, 21955, 21955]) Position ids shape: torch.Size([1, 21955]) Input IDs shape: torch.Size([1, 21955]) Labels shape: torch.Size([1, 21955]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 39760 Attention mask shape: torch.Size([1, 1, 39760, 39760]) Position ids shape: torch.Size([1, 39760]) Input IDs shape: torch.Size([1, 39760]) Labels shape: torch.Size([1, 39760]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 22491 Attention mask shape: torch.Size([1, 1, 22491, 22491]) Position ids shape: torch.Size([1, 22491]) Input IDs shape: torch.Size([1, 22491]) Labels shape: torch.Size([1, 22491]) Final batch size: 1, sequence length: 27283 Attention mask shape: torch.Size([1, 1, 27283, 27283]) Position ids shape: torch.Size([1, 27283]) Input IDs shape: torch.Size([1, 27283]) Labels shape: torch.Size([1, 27283]) Final batch size: 1, sequence length: 19867 Attention mask shape: torch.Size([1, 1, 19867, 19867]) Position ids shape: torch.Size([1, 19867]) Input IDs shape: torch.Size([1, 19867]) Labels shape: torch.Size([1, 19867]) Final batch size: 1, sequence length: 22434 Attention mask shape: torch.Size([1, 1, 22434, 22434]) Position ids shape: torch.Size([1, 22434]) Input IDs shape: torch.Size([1, 22434]) Labels shape: torch.Size([1, 22434]) Final batch size: 1, sequence length: 40507 Attention mask shape: torch.Size([1, 1, 40507, 40507]) Position ids shape: torch.Size([1, 40507]) Input IDs shape: torch.Size([1, 40507]) Labels shape: torch.Size([1, 40507]) Final batch size: 1, sequence length: 36579 Attention mask shape: torch.Size([1, 1, 36579, 36579]) Position ids shape: torch.Size([1, 36579]) Input IDs shape: torch.Size([1, 36579]) Labels shape: torch.Size([1, 36579]) Final batch size: 1, sequence length: 37183 Attention mask shape: torch.Size([1, 1, 37183, 37183]) Position ids shape: torch.Size([1, 37183]) Input IDs shape: torch.Size([1, 37183]) Labels shape: torch.Size([1, 37183]) Final batch size: 1, sequence length: 26664 Attention mask shape: torch.Size([1, 1, 26664, 26664]) Position ids shape: torch.Size([1, 26664]) Input IDs shape: torch.Size([1, 26664]) Labels shape: torch.Size([1, 26664]) Final batch size: 1, sequence length: 17646 Attention mask shape: torch.Size([1, 1, 17646, 17646]) Position ids shape: torch.Size([1, 17646]) Input IDs shape: torch.Size([1, 17646]) Labels shape: torch.Size([1, 17646]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 21489 Attention mask shape: torch.Size([1, 1, 21489, 21489]) Position ids shape: torch.Size([1, 21489]) Input IDs shape: torch.Size([1, 21489]) Labels shape: torch.Size([1, 21489]) Final batch size: 1, sequence length: 29082 Attention mask shape: torch.Size([1, 1, 29082, 29082]) Position ids shape: torch.Size([1, 29082]) Input IDs shape: torch.Size([1, 29082]) Labels shape: torch.Size([1, 29082]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 37775 Attention mask shape: torch.Size([1, 1, 37775, 37775]) Position ids shape: torch.Size([1, 37775]) Input IDs shape: torch.Size([1, 37775]) Labels shape: torch.Size([1, 37775]) Final batch size: 1, sequence length: 23014 Attention mask shape: torch.Size([1, 1, 23014, 23014]) Position ids shape: torch.Size([1, 23014]) Input IDs shape: torch.Size([1, 23014]) Labels shape: torch.Size([1, 23014]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 28661 Attention mask shape: torch.Size([1, 1, 28661, 28661]) Position ids shape: torch.Size([1, 28661]) Input IDs shape: torch.Size([1, 28661]) Labels shape: torch.Size([1, 28661]) Final batch size: 1, sequence length: 16273 Attention mask shape: torch.Size([1, 1, 16273, 16273]) Position ids shape: torch.Size([1, 16273]) Input IDs shape: torch.Size([1, 16273]) Labels shape: torch.Size([1, 16273]) Final batch size: 1, sequence length: 38686 Attention mask shape: torch.Size([1, 1, 38686, 38686]) Position ids shape: torch.Size([1, 38686]) Input IDs shape: torch.Size([1, 38686]) Labels shape: torch.Size([1, 38686]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 37091 Attention mask shape: torch.Size([1, 1, 37091, 37091]) Position ids shape: torch.Size([1, 37091]) Input IDs shape: torch.Size([1, 37091]) Labels shape: torch.Size([1, 37091]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 33263 Attention mask shape: torch.Size([1, 1, 33263, 33263]) Position ids shape: torch.Size([1, 33263]) Input IDs shape: torch.Size([1, 33263]) Labels shape: torch.Size([1, 33263]) Final batch size: 1, sequence length: 25914 Attention mask shape: torch.Size([1, 1, 25914, 25914]) Position ids shape: torch.Size([1, 25914]) Input IDs shape: torch.Size([1, 25914]) Labels shape: torch.Size([1, 25914]) Final batch size: 1, sequence length: 30364 Attention mask shape: torch.Size([1, 1, 30364, 30364]) Position ids shape: torch.Size([1, 30364]) Input IDs shape: torch.Size([1, 30364]) Labels shape: torch.Size([1, 30364]) Final batch size: 1, sequence length: 36297 Attention mask shape: torch.Size([1, 1, 36297, 36297]) Position ids shape: torch.Size([1, 36297]) Input IDs shape: torch.Size([1, 36297]) Labels shape: torch.Size([1, 36297]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) {'loss': 0.2519, 'grad_norm': 0.16317069106173507, 'learning_rate': 1.7037086855465902e-07, 'num_tokens': -inf, 'epoch': 7.5} Final batch size: 1, sequence length: 6519 Attention mask shape: torch.Size([1, 1, 6519, 6519]) Position ids shape: torch.Size([1, 6519]) Input IDs shape: torch.Size([1, 6519]) Labels shape: torch.Size([1, 6519]) Final batch size: 1, sequence length: 5525 Attention mask shape: torch.Size([1, 1, 5525, 5525]) Position ids shape: torch.Size([1, 5525]) Input IDs shape: torch.Size([1, 5525]) Labels shape: torch.Size([1, 5525]) Final batch size: 1, sequence length: 10273 Attention mask shape: torch.Size([1, 1, 10273, 10273]) Position ids shape: torch.Size([1, 10273]) Input IDs shape: torch.Size([1, 10273]) Labels shape: torch.Size([1, 10273]) Final batch size: 1, sequence length: 9181 Attention mask shape: torch.Size([1, 1, 9181, 9181]) Position ids shape: torch.Size([1, 9181]) Input IDs shape: torch.Size([1, 9181]) Labels shape: torch.Size([1, 9181]) Final batch size: 1, sequence length: 10408 Attention mask shape: torch.Size([1, 1, 10408, 10408]) Position ids shape: torch.Size([1, 10408]) Input IDs shape: torch.Size([1, 10408]) Labels shape: torch.Size([1, 10408]) Final batch size: 1, sequence length: 12281 Attention mask shape: torch.Size([1, 1, 12281, 12281]) Position ids shape: torch.Size([1, 12281]) Input IDs shape: torch.Size([1, 12281]) Labels shape: torch.Size([1, 12281]) Final batch size: 1, sequence length: 12927 Attention mask shape: torch.Size([1, 1, 12927, 12927]) Position ids shape: torch.Size([1, 12927]) Input IDs shape: torch.Size([1, 12927]) Labels shape: torch.Size([1, 12927]) Final batch size: 1, sequence length: 13385 Attention mask shape: torch.Size([1, 1, 13385, 13385]) Position ids shape: torch.Size([1, 13385]) Input IDs shape: torch.Size([1, 13385]) Labels shape: torch.Size([1, 13385]) Final batch size: 1, sequence length: 15257 Attention mask shape: torch.Size([1, 1, 15257, 15257]) Position ids shape: torch.Size([1, 15257]) Input IDs shape: torch.Size([1, 15257]) Labels shape: torch.Size([1, 15257]) Final batch size: 1, sequence length: 13804 Attention mask shape: torch.Size([1, 1, 13804, 13804]) Position ids shape: torch.Size([1, 13804]) Input IDs shape: torch.Size([1, 13804]) Labels shape: torch.Size([1, 13804]) Final batch size: 1, sequence length: 13363 Attention mask shape: torch.Size([1, 1, 13363, 13363]) Position ids shape: torch.Size([1, 13363]) Input IDs shape: torch.Size([1, 13363]) Labels shape: torch.Size([1, 13363]) Final batch size: 1, sequence length: 15518 Attention mask shape: torch.Size([1, 1, 15518, 15518]) Position ids shape: torch.Size([1, 15518]) Input IDs shape: torch.Size([1, 15518]) Labels shape: torch.Size([1, 15518]) Final batch size: 1, sequence length: 10905 Attention mask shape: torch.Size([1, 1, 10905, 10905]) Position ids shape: torch.Size([1, 10905]) Input IDs shape: torch.Size([1, 10905]) Labels shape: torch.Size([1, 10905]) Final batch size: 1, sequence length: 17003 Attention mask shape: torch.Size([1, 1, 17003, 17003]) Position ids shape: torch.Size([1, 17003]) Input IDs shape: torch.Size([1, 17003]) Labels shape: torch.Size([1, 17003]) Final batch size: 1, sequence length: 16520 Attention mask shape: torch.Size([1, 1, 16520, 16520]) Position ids shape: torch.Size([1, 16520]) Input IDs shape: torch.Size([1, 16520]) Labels shape: torch.Size([1, 16520]) Final batch size: 1, sequence length: 19768 Attention mask shape: torch.Size([1, 1, 19768, 19768]) Position ids shape: torch.Size([1, 19768]) Input IDs shape: torch.Size([1, 19768]) Labels shape: torch.Size([1, 19768]) Final batch size: 1, sequence length: 8646 Attention mask shape: torch.Size([1, 1, 8646, 8646]) Position ids shape: torch.Size([1, 8646]) Input IDs shape: torch.Size([1, 8646]) Labels shape: torch.Size([1, 8646]) Final batch size: 1, sequence length: 15244 Attention mask shape: torch.Size([1, 1, 15244, 15244]) Position ids shape: torch.Size([1, 15244]) Input IDs shape: torch.Size([1, 15244]) Labels shape: torch.Size([1, 15244]) Final batch size: 1, sequence length: 20089 Attention mask shape: torch.Size([1, 1, 20089, 20089]) Position ids shape: torch.Size([1, 20089]) Input IDs shape: torch.Size([1, 20089]) Labels shape: torch.Size([1, 20089]) Final batch size: 1, sequence length: 18645 Attention mask shape: torch.Size([1, 1, 18645, 18645]) Position ids shape: torch.Size([1, 18645]) Input IDs shape: torch.Size([1, 18645]) Labels shape: torch.Size([1, 18645]) Final batch size: 1, sequence length: 20979 Attention mask shape: torch.Size([1, 1, 20979, 20979]) Position ids shape: torch.Size([1, 20979]) Input IDs shape: torch.Size([1, 20979]) Labels shape: torch.Size([1, 20979]) Final batch size: 1, sequence length: 14429 Attention mask shape: torch.Size([1, 1, 14429, 14429]) Position ids shape: torch.Size([1, 14429]) Input IDs shape: torch.Size([1, 14429]) Labels shape: torch.Size([1, 14429]) Final batch size: 1, sequence length: 19513 Attention mask shape: torch.Size([1, 1, 19513, 19513]) Position ids shape: torch.Size([1, 19513]) Input IDs shape: torch.Size([1, 19513]) Labels shape: torch.Size([1, 19513]) Final batch size: 1, sequence length: 22311 Attention mask shape: torch.Size([1, 1, 22311, 22311]) Position ids shape: torch.Size([1, 22311]) Input IDs shape: torch.Size([1, 22311]) Labels shape: torch.Size([1, 22311]) Final batch size: 1, sequence length: 22915 Attention mask shape: torch.Size([1, 1, 22915, 22915]) Position ids shape: torch.Size([1, 22915]) Input IDs shape: torch.Size([1, 22915]) Labels shape: torch.Size([1, 22915]) Final batch size: 1, sequence length: 20106 Attention mask shape: torch.Size([1, 1, 20106, 20106]) Position ids shape: torch.Size([1, 20106]) Input IDs shape: torch.Size([1, 20106]) Labels shape: torch.Size([1, 20106]) Final batch size: 1, sequence length: 22777 Attention mask shape: torch.Size([1, 1, 22777, 22777]) Position ids shape: torch.Size([1, 22777]) Input IDs shape: torch.Size([1, 22777]) Labels shape: torch.Size([1, 22777]) Final batch size: 1, sequence length: 19286 Attention mask shape: torch.Size([1, 1, 19286, 19286]) Position ids shape: torch.Size([1, 19286]) Input IDs shape: torch.Size([1, 19286]) Labels shape: torch.Size([1, 19286]) Final batch size: 1, sequence length: 23942 Attention mask shape: torch.Size([1, 1, 23942, 23942]) Position ids shape: torch.Size([1, 23942]) Input IDs shape: torch.Size([1, 23942]) Labels shape: torch.Size([1, 23942]) Final batch size: 1, sequence length: 18819 Final batch size: 1, sequence length: 18988 Final batch size: 1, sequence length: 23995 Attention mask shape: torch.Size([1, 1, 18819, 18819]) Position ids shape: torch.Size([1, 18819]) Input IDs shape: torch.Size([1, 18819]) Labels shape: torch.Size([1, 18819]) Attention mask shape: torch.Size([1, 1, 18988, 18988]) Position ids shape: torch.Size([1, 18988]) Attention mask shape: torch.Size([1, 1, 23995, 23995]) Position ids shape: torch.Size([1, 23995]) Input IDs shape: torch.Size([1, 23995]) Labels shape: torch.Size([1, 23995]) Input IDs shape: torch.Size([1, 18988]) Labels shape: torch.Size([1, 18988]) Final batch size: 1, sequence length: 25001 Attention mask shape: torch.Size([1, 1, 25001, 25001]) Position ids shape: torch.Size([1, 25001]) Input IDs shape: torch.Size([1, 25001]) Labels shape: torch.Size([1, 25001]) Final batch size: 1, sequence length: 23341 Attention mask shape: torch.Size([1, 1, 23341, 23341]) Position ids shape: torch.Size([1, 23341]) Input IDs shape: torch.Size([1, 23341]) Labels shape: torch.Size([1, 23341]) Final batch size: 1, sequence length: 22979 Attention mask shape: torch.Size([1, 1, 22979, 22979]) Position ids shape: torch.Size([1, 22979]) Input IDs shape: torch.Size([1, 22979]) Labels shape: torch.Size([1, 22979]) Final batch size: 1, sequence length: 23886 Attention mask shape: torch.Size([1, 1, 23886, 23886]) Position ids shape: torch.Size([1, 23886]) Input IDs shape: torch.Size([1, 23886]) Labels shape: torch.Size([1, 23886]) Final batch size: 1, sequence length: 25021 Attention mask shape: torch.Size([1, 1, 25021, 25021]) Position ids shape: torch.Size([1, 25021]) Input IDs shape: torch.Size([1, 25021]) Labels shape: torch.Size([1, 25021]) Final batch size: 1, sequence length: 21660 Attention mask shape: torch.Size([1, 1, 21660, 21660]) Position ids shape: torch.Size([1, 21660]) Input IDs shape: torch.Size([1, 21660]) Labels shape: torch.Size([1, 21660]) Final batch size: 1, sequence length: 22160 Attention mask shape: torch.Size([1, 1, 22160, 22160]) Position ids shape: torch.Size([1, 22160]) Input IDs shape: torch.Size([1, 22160]) Labels shape: torch.Size([1, 22160]) Final batch size: 1, sequence length: 5405 Attention mask shape: torch.Size([1, 1, 5405, 5405]) Position ids shape: torch.Size([1, 5405]) Input IDs shape: torch.Size([1, 5405]) Labels shape: torch.Size([1, 5405]) Final batch size: 1, sequence length: 24287 Attention mask shape: torch.Size([1, 1, 24287, 24287]) Position ids shape: torch.Size([1, 24287]) Input IDs shape: torch.Size([1, 24287]) Labels shape: torch.Size([1, 24287]) Final batch size: 1, sequence length: 5288 Attention mask shape: torch.Size([1, 1, 5288, 5288]) Position ids shape: torch.Size([1, 5288]) Input IDs shape: torch.Size([1, 5288]) Labels shape: torch.Size([1, 5288]) Final batch size: 1, sequence length: 26461 Attention mask shape: torch.Size([1, 1, 26461, 26461]) Position ids shape: torch.Size([1, 26461]) Input IDs shape: torch.Size([1, 26461]) Labels shape: torch.Size([1, 26461]) Final batch size: 1, sequence length: 24293 Attention mask shape: torch.Size([1, 1, 24293, 24293]) Position ids shape: torch.Size([1, 24293]) Input IDs shape: torch.Size([1, 24293]) Labels shape: torch.Size([1, 24293]) Final batch size: 1, sequence length: 28263 Attention mask shape: torch.Size([1, 1, 28263, 28263]) Position ids shape: torch.Size([1, 28263]) Input IDs shape: torch.Size([1, 28263]) Labels shape: torch.Size([1, 28263]) Final batch size: 1, sequence length: 24407 Attention mask shape: torch.Size([1, 1, 24407, 24407]) Position ids shape: torch.Size([1, 24407]) Input IDs shape: torch.Size([1, 24407]) Labels shape: torch.Size([1, 24407]) Final batch size: 1, sequence length: 11266 Attention mask shape: torch.Size([1, 1, 11266, 11266]) Position ids shape: torch.Size([1, 11266]) Input IDs shape: torch.Size([1, 11266]) Labels shape: torch.Size([1, 11266]) Final batch size: 1, sequence length: 29009 Attention mask shape: torch.Size([1, 1, 29009, 29009]) Position ids shape: torch.Size([1, 29009]) Input IDs shape: torch.Size([1, 29009]) Labels shape: torch.Size([1, 29009]) Final batch size: 1, sequence length: 29109 Attention mask shape: torch.Size([1, 1, 29109, 29109]) Position ids shape: torch.Size([1, 29109]) Input IDs shape: torch.Size([1, 29109]) Labels shape: torch.Size([1, 29109]) Final batch size: 1, sequence length: 29561 Attention mask shape: torch.Size([1, 1, 29561, 29561]) Position ids shape: torch.Size([1, 29561]) Input IDs shape: torch.Size([1, 29561]) Labels shape: torch.Size([1, 29561]) Final batch size: 1, sequence length: 12483 Attention mask shape: torch.Size([1, 1, 12483, 12483]) Position ids shape: torch.Size([1, 12483]) Input IDs shape: torch.Size([1, 12483]) Labels shape: torch.Size([1, 12483]) Final batch size: 1, sequence length: 26179 Attention mask shape: torch.Size([1, 1, 26179, 26179]) Position ids shape: torch.Size([1, 26179]) Input IDs shape: torch.Size([1, 26179]) Labels shape: torch.Size([1, 26179]) Final batch size: 1, sequence length: 20941 Attention mask shape: torch.Size([1, 1, 20941, 20941]) Position ids shape: torch.Size([1, 20941]) Input IDs shape: torch.Size([1, 20941]) Labels shape: torch.Size([1, 20941]) Final batch size: 1, sequence length: 26886 Attention mask shape: torch.Size([1, 1, 26886, 26886]) Position ids shape: torch.Size([1, 26886]) Input IDs shape: torch.Size([1, 26886]) Labels shape: torch.Size([1, 26886]) Final batch size: 1, sequence length: 23724 Final batch size: 1, sequence length: 28440 Attention mask shape: torch.Size([1, 1, 23724, 23724]) Position ids shape: torch.Size([1, 23724]) Input IDs shape: torch.Size([1, 23724]) Labels shape: torch.Size([1, 23724]) Attention mask shape: torch.Size([1, 1, 28440, 28440]) Position ids shape: torch.Size([1, 28440]) Input IDs shape: torch.Size([1, 28440]) Labels shape: torch.Size([1, 28440]) Final batch size: 1, sequence length: 30723 Attention mask shape: torch.Size([1, 1, 30723, 30723]) Position ids shape: torch.Size([1, 30723]) Input IDs shape: torch.Size([1, 30723]) Labels shape: torch.Size([1, 30723]) Final batch size: 1, sequence length: 26054 Attention mask shape: torch.Size([1, 1, 26054, 26054]) Position ids shape: torch.Size([1, 26054]) Input IDs shape: torch.Size([1, 26054]) Labels shape: torch.Size([1, 26054]) Final batch size: 1, sequence length: 29152 Attention mask shape: torch.Size([1, 1, 29152, 29152]) Position ids shape: torch.Size([1, 29152]) Input IDs shape: torch.Size([1, 29152]) Labels shape: torch.Size([1, 29152]) Final batch size: 1, sequence length: 17539 Attention mask shape: torch.Size([1, 1, 17539, 17539]) Position ids shape: torch.Size([1, 17539]) Input IDs shape: torch.Size([1, 17539]) Labels shape: torch.Size([1, 17539]) Final batch size: 1, sequence length: 17649 Attention mask shape: torch.Size([1, 1, 17649, 17649]) Position ids shape: torch.Size([1, 17649]) Input IDs shape: torch.Size([1, 17649]) Labels shape: torch.Size([1, 17649]) Final batch size: 1, sequence length: 30236 Attention mask shape: torch.Size([1, 1, 30236, 30236]) Position ids shape: torch.Size([1, 30236]) Input IDs shape: torch.Size([1, 30236]) Labels shape: torch.Size([1, 30236]) Final batch size: 1, sequence length: 32660 Attention mask shape: torch.Size([1, 1, 32660, 32660]) Position ids shape: torch.Size([1, 32660]) Input IDs shape: torch.Size([1, 32660]) Labels shape: torch.Size([1, 32660]) Final batch size: 1, sequence length: 21472 Attention mask shape: torch.Size([1, 1, 21472, 21472]) Position ids shape: torch.Size([1, 21472]) Input IDs shape: torch.Size([1, 21472]) Labels shape: torch.Size([1, 21472]) Final batch size: 1, sequence length: 32799 Attention mask shape: torch.Size([1, 1, 32799, 32799]) Position ids shape: torch.Size([1, 32799]) Input IDs shape: torch.Size([1, 32799]) Labels shape: torch.Size([1, 32799]) Final batch size: 1, sequence length: 27405 Attention mask shape: torch.Size([1, 1, 27405, 27405]) Position ids shape: torch.Size([1, 27405]) Input IDs shape: torch.Size([1, 27405]) Labels shape: torch.Size([1, 27405]) Final batch size: 1, sequence length: 13600 Attention mask shape: torch.Size([1, 1, 13600, 13600]) Position ids shape: torch.Size([1, 13600]) Input IDs shape: torch.Size([1, 13600]) Labels shape: torch.Size([1, 13600]) Final batch size: 1, sequence length: 17951 Attention mask shape: torch.Size([1, 1, 17951, 17951]) Position ids shape: torch.Size([1, 17951]) Input IDs shape: torch.Size([1, 17951]) Labels shape: torch.Size([1, 17951]) Final batch size: 1, sequence length: 24056 Attention mask shape: torch.Size([1, 1, 24056, 24056]) Position ids shape: torch.Size([1, 24056]) Input IDs shape: torch.Size([1, 24056]) Labels shape: torch.Size([1, 24056]) Final batch size: 1, sequence length: 29237 Attention mask shape: torch.Size([1, 1, 29237, 29237]) Position ids shape: torch.Size([1, 29237]) Input IDs shape: torch.Size([1, 29237]) Labels shape: torch.Size([1, 29237]) Final batch size: 1, sequence length: 29880 Attention mask shape: torch.Size([1, 1, 29880, 29880]) Position ids shape: torch.Size([1, 29880]) Input IDs shape: torch.Size([1, 29880]) Labels shape: torch.Size([1, 29880]) Final batch size: 1, sequence length: 32083 Attention mask shape: torch.Size([1, 1, 32083, 32083]) Position ids shape: torch.Size([1, 32083]) Input IDs shape: torch.Size([1, 32083]) Labels shape: torch.Size([1, 32083]) Final batch size: 1, sequence length: 26452 Attention mask shape: torch.Size([1, 1, 26452, 26452]) Position ids shape: torch.Size([1, 26452]) Input IDs shape: torch.Size([1, 26452]) Labels shape: torch.Size([1, 26452]) Final batch size: 1, sequence length: 32287 Attention mask shape: torch.Size([1, 1, 32287, 32287]) Position ids shape: torch.Size([1, 32287]) Input IDs shape: torch.Size([1, 32287]) Labels shape: torch.Size([1, 32287]) Final batch size: 1, sequence length: 34694 Attention mask shape: torch.Size([1, 1, 34694, 34694]) Position ids shape: torch.Size([1, 34694]) Input IDs shape: torch.Size([1, 34694]) Labels shape: torch.Size([1, 34694]) Final batch size: 1, sequence length: 31512 Attention mask shape: torch.Size([1, 1, 31512, 31512]) Position ids shape: torch.Size([1, 31512]) Input IDs shape: torch.Size([1, 31512]) Labels shape: torch.Size([1, 31512]) Final batch size: 1, sequence length: 15906 Attention mask shape: torch.Size([1, 1, 15906, 15906]) Position ids shape: torch.Size([1, 15906]) Input IDs shape: torch.Size([1, 15906]) Labels shape: torch.Size([1, 15906]) Final batch size: 1, sequence length: 8008 Attention mask shape: torch.Size([1, 1, 8008, 8008]) Position ids shape: torch.Size([1, 8008]) Input IDs shape: torch.Size([1, 8008]) Labels shape: torch.Size([1, 8008]) Final batch size: 1, sequence length: 27987 Attention mask shape: torch.Size([1, 1, 27987, 27987]) Position ids shape: torch.Size([1, 27987]) Input IDs shape: torch.Size([1, 27987]) Labels shape: torch.Size([1, 27987]) Final batch size: 1, sequence length: 23945 Attention mask shape: torch.Size([1, 1, 23945, 23945]) Position ids shape: torch.Size([1, 23945]) Input IDs shape: torch.Size([1, 23945]) Labels shape: torch.Size([1, 23945]) Final batch size: 1, sequence length: 15830 Attention mask shape: torch.Size([1, 1, 15830, 15830]) Position ids shape: torch.Size([1, 15830]) Input IDs shape: torch.Size([1, 15830]) Labels shape: torch.Size([1, 15830]) Final batch size: 1, sequence length: 17775 Attention mask shape: torch.Size([1, 1, 17775, 17775]) Position ids shape: torch.Size([1, 17775]) Input IDs shape: torch.Size([1, 17775]) Labels shape: torch.Size([1, 17775]) Final batch size: 1, sequence length: 35478 Attention mask shape: torch.Size([1, 1, 35478, 35478]) Position ids shape: torch.Size([1, 35478]) Input IDs shape: torch.Size([1, 35478]) Labels shape: torch.Size([1, 35478]) Final batch size: 1, sequence length: 20694 Attention mask shape: torch.Size([1, 1, 20694, 20694]) Position ids shape: torch.Size([1, 20694]) Input IDs shape: torch.Size([1, 20694]) Labels shape: torch.Size([1, 20694]) Final batch size: 1, sequence length: 35999 Attention mask shape: torch.Size([1, 1, 35999, 35999]) Position ids shape: torch.Size([1, 35999]) Input IDs shape: torch.Size([1, 35999]) Labels shape: torch.Size([1, 35999]) Final batch size: 1, sequence length: 7448 Attention mask shape: torch.Size([1, 1, 7448, 7448]) Position ids shape: torch.Size([1, 7448]) Input IDs shape: torch.Size([1, 7448]) Labels shape: torch.Size([1, 7448]) Final batch size: 1, sequence length: 16852 Attention mask shape: torch.Size([1, 1, 16852, 16852]) Position ids shape: torch.Size([1, 16852]) Input IDs shape: torch.Size([1, 16852]) Labels shape: torch.Size([1, 16852]) Final batch size: 1, sequence length: 31786 Attention mask shape: torch.Size([1, 1, 31786, 31786]) Position ids shape: torch.Size([1, 31786]) Input IDs shape: torch.Size([1, 31786]) Labels shape: torch.Size([1, 31786]) Final batch size: 1, sequence length: 35696 Attention mask shape: torch.Size([1, 1, 35696, 35696]) Position ids shape: torch.Size([1, 35696]) Input IDs shape: torch.Size([1, 35696]) Labels shape: torch.Size([1, 35696]) Final batch size: 1, sequence length: 34777 Attention mask shape: torch.Size([1, 1, 34777, 34777]) Position ids shape: torch.Size([1, 34777]) Input IDs shape: torch.Size([1, 34777]) Labels shape: torch.Size([1, 34777]) Final batch size: 1, sequence length: 28879 Attention mask shape: torch.Size([1, 1, 28879, 28879]) Position ids shape: torch.Size([1, 28879]) Input IDs shape: torch.Size([1, 28879]) Labels shape: torch.Size([1, 28879]) Final batch size: 1, sequence length: 29262 Attention mask shape: torch.Size([1, 1, 29262, 29262]) Position ids shape: torch.Size([1, 29262]) Input IDs shape: torch.Size([1, 29262]) Labels shape: torch.Size([1, 29262]) Final batch size: 1, sequence length: 28691 Attention mask shape: torch.Size([1, 1, 28691, 28691]) Position ids shape: torch.Size([1, 28691]) Input IDs shape: torch.Size([1, 28691]) Labels shape: torch.Size([1, 28691]) Final batch size: 1, sequence length: 24824 Attention mask shape: torch.Size([1, 1, 24824, 24824]) Position ids shape: torch.Size([1, 24824]) Input IDs shape: torch.Size([1, 24824]) Labels shape: torch.Size([1, 24824]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 16631 Attention mask shape: torch.Size([1, 1, 16631, 16631]) Position ids shape: torch.Size([1, 16631]) Input IDs shape: torch.Size([1, 16631]) Labels shape: torch.Size([1, 16631]) Final batch size: 1, sequence length: 35526 Attention mask shape: torch.Size([1, 1, 35526, 35526]) Position ids shape: torch.Size([1, 35526]) Input IDs shape: torch.Size([1, 35526]) Labels shape: torch.Size([1, 35526]) Final batch size: 1, sequence length: 36314 Attention mask shape: torch.Size([1, 1, 36314, 36314]) Position ids shape: torch.Size([1, 36314]) Input IDs shape: torch.Size([1, 36314]) Labels shape: torch.Size([1, 36314]) Final batch size: 1, sequence length: 6378 Attention mask shape: torch.Size([1, 1, 6378, 6378]) Position ids shape: torch.Size([1, 6378]) Input IDs shape: torch.Size([1, 6378]) Labels shape: torch.Size([1, 6378]) Final batch size: 1, sequence length: 38935 Attention mask shape: torch.Size([1, 1, 38935, 38935]) Position ids shape: torch.Size([1, 38935]) Input IDs shape: torch.Size([1, 38935]) Labels shape: torch.Size([1, 38935]) Final batch size: 1, sequence length: 37933 Attention mask shape: torch.Size([1, 1, 37933, 37933]) Position ids shape: torch.Size([1, 37933]) Input IDs shape: torch.Size([1, 37933]) Labels shape: torch.Size([1, 37933]) Final batch size: 1, sequence length: 39476 Attention mask shape: torch.Size([1, 1, 39476, 39476]) Position ids shape: torch.Size([1, 39476]) Input IDs shape: torch.Size([1, 39476]) Labels shape: torch.Size([1, 39476]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 10480 Attention mask shape: torch.Size([1, 1, 10480, 10480]) Position ids shape: torch.Size([1, 10480]) Input IDs shape: torch.Size([1, 10480]) Labels shape: torch.Size([1, 10480]) Final batch size: 1, sequence length: 31132 Attention mask shape: torch.Size([1, 1, 31132, 31132]) Position ids shape: torch.Size([1, 31132]) Input IDs shape: torch.Size([1, 31132]) Labels shape: torch.Size([1, 31132]) Final batch size: 1, sequence length: 28495 Attention mask shape: torch.Size([1, 1, 28495, 28495]) Position ids shape: torch.Size([1, 28495]) Input IDs shape: torch.Size([1, 28495]) Labels shape: torch.Size([1, 28495]) Final batch size: 1, sequence length: 11395 Attention mask shape: torch.Size([1, 1, 11395, 11395]) Position ids shape: torch.Size([1, 11395]) Input IDs shape: torch.Size([1, 11395]) Labels shape: torch.Size([1, 11395]) Final batch size: 1, sequence length: 20176 Attention mask shape: torch.Size([1, 1, 20176, 20176]) Position ids shape: torch.Size([1, 20176]) Input IDs shape: torch.Size([1, 20176]) Labels shape: torch.Size([1, 20176]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 14869 Attention mask shape: torch.Size([1, 1, 14869, 14869]) Position ids shape: torch.Size([1, 14869]) Input IDs shape: torch.Size([1, 14869]) Labels shape: torch.Size([1, 14869]) Final batch size: 1, sequence length: 35116 Attention mask shape: torch.Size([1, 1, 35116, 35116]) Position ids shape: torch.Size([1, 35116]) Input IDs shape: torch.Size([1, 35116]) Labels shape: torch.Size([1, 35116]) Final batch size: 1, sequence length: 39687 Attention mask shape: torch.Size([1, 1, 39687, 39687]) Position ids shape: torch.Size([1, 39687]) Input IDs shape: torch.Size([1, 39687]) Labels shape: torch.Size([1, 39687]) Final batch size: 1, sequence length: 18127 Attention mask shape: torch.Size([1, 1, 18127, 18127]) Position ids shape: torch.Size([1, 18127]) Input IDs shape: torch.Size([1, 18127]) Labels shape: torch.Size([1, 18127]) Final batch size: 1, sequence length: 37159 Attention mask shape: torch.Size([1, 1, 37159, 37159]) Position ids shape: torch.Size([1, 37159]) Input IDs shape: torch.Size([1, 37159]) Labels shape: torch.Size([1, 37159]) Final batch size: 1, sequence length: 18911 Attention mask shape: torch.Size([1, 1, 18911, 18911]) Position ids shape: torch.Size([1, 18911]) Input IDs shape: torch.Size([1, 18911]) Labels shape: torch.Size([1, 18911]) Final batch size: 1, sequence length: 30024 Attention mask shape: torch.Size([1, 1, 30024, 30024]) Position ids shape: torch.Size([1, 30024]) Input IDs shape: torch.Size([1, 30024]) Labels shape: torch.Size([1, 30024]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40605 Attention mask shape: torch.Size([1, 1, 40605, 40605]) Position ids shape: torch.Size([1, 40605]) Input IDs shape: torch.Size([1, 40605]) Labels shape: torch.Size([1, 40605]) Final batch size: 1, sequence length: 36469 Attention mask shape: torch.Size([1, 1, 36469, 36469]) Position ids shape: torch.Size([1, 36469]) Input IDs shape: torch.Size([1, 36469]) Labels shape: torch.Size([1, 36469]) Final batch size: 1, sequence length: 28313 Attention mask shape: torch.Size([1, 1, 28313, 28313]) Position ids shape: torch.Size([1, 28313]) Input IDs shape: torch.Size([1, 28313]) Labels shape: torch.Size([1, 28313]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 39324 Attention mask shape: torch.Size([1, 1, 39324, 39324]) Position ids shape: torch.Size([1, 39324]) Input IDs shape: torch.Size([1, 39324]) Labels shape: torch.Size([1, 39324]) Final batch size: 1, sequence length: 30245 Attention mask shape: torch.Size([1, 1, 30245, 30245]) Position ids shape: torch.Size([1, 30245]) Input IDs shape: torch.Size([1, 30245]) Labels shape: torch.Size([1, 30245]) Final batch size: 1, sequence length: 33070 Attention mask shape: torch.Size([1, 1, 33070, 33070]) Position ids shape: torch.Size([1, 33070]) Input IDs shape: torch.Size([1, 33070]) Labels shape: torch.Size([1, 33070]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 26402 Attention mask shape: torch.Size([1, 1, 26402, 26402]) Position ids shape: torch.Size([1, 26402]) Input IDs shape: torch.Size([1, 26402]) Labels shape: torch.Size([1, 26402]) {'loss': 0.2363, 'grad_norm': 0.1702038484617533, 'learning_rate': 1.0926199633097156e-07, 'num_tokens': -inf, 'epoch': 7.62} Final batch size: 1, sequence length: 3968 Attention mask shape: torch.Size([1, 1, 3968, 3968]) Position ids shape: torch.Size([1, 3968]) Input IDs shape: torch.Size([1, 3968]) Labels shape: torch.Size([1, 3968]) Final batch size: 1, sequence length: 6567 Attention mask shape: torch.Size([1, 1, 6567, 6567]) Position ids shape: torch.Size([1, 6567]) Input IDs shape: torch.Size([1, 6567]) Labels shape: torch.Size([1, 6567]) Final batch size: 1, sequence length: 8347 Attention mask shape: torch.Size([1, 1, 8347, 8347]) Position ids shape: torch.Size([1, 8347]) Input IDs shape: torch.Size([1, 8347]) Labels shape: torch.Size([1, 8347]) Final batch size: 1, sequence length: 5997 Attention mask shape: torch.Size([1, 1, 5997, 5997]) Position ids shape: torch.Size([1, 5997]) Input IDs shape: torch.Size([1, 5997]) Labels shape: torch.Size([1, 5997]) Final batch size: 1, sequence length: 11752 Attention mask shape: torch.Size([1, 1, 11752, 11752]) Position ids shape: torch.Size([1, 11752]) Input IDs shape: torch.Size([1, 11752]) Labels shape: torch.Size([1, 11752]) Final batch size: 1, sequence length: 11880 Attention mask shape: torch.Size([1, 1, 11880, 11880]) Position ids shape: torch.Size([1, 11880]) Input IDs shape: torch.Size([1, 11880]) Labels shape: torch.Size([1, 11880]) Final batch size: 1, sequence length: 11054 Attention mask shape: torch.Size([1, 1, 11054, 11054]) Position ids shape: torch.Size([1, 11054]) Input IDs shape: torch.Size([1, 11054]) Labels shape: torch.Size([1, 11054]) Final batch size: 1, sequence length: 13260 Attention mask shape: torch.Size([1, 1, 13260, 13260]) Position ids shape: torch.Size([1, 13260]) Input IDs shape: torch.Size([1, 13260]) Labels shape: torch.Size([1, 13260]) Final batch size: 1, sequence length: 11232 Attention mask shape: torch.Size([1, 1, 11232, 11232]) Position ids shape: torch.Size([1, 11232]) Input IDs shape: torch.Size([1, 11232]) Labels shape: torch.Size([1, 11232]) Final batch size: 1, sequence length: 14758 Attention mask shape: torch.Size([1, 1, 14758, 14758]) Position ids shape: torch.Size([1, 14758]) Input IDs shape: torch.Size([1, 14758]) Labels shape: torch.Size([1, 14758]) Final batch size: 1, sequence length: 9027 Attention mask shape: torch.Size([1, 1, 9027, 9027]) Position ids shape: torch.Size([1, 9027]) Input IDs shape: torch.Size([1, 9027]) Labels shape: torch.Size([1, 9027]) Final batch size: 1, sequence length: 12370 Attention mask shape: torch.Size([1, 1, 12370, 12370]) Position ids shape: torch.Size([1, 12370]) Input IDs shape: torch.Size([1, 12370]) Labels shape: torch.Size([1, 12370]) Final batch size: 1, sequence length: 14597 Attention mask shape: torch.Size([1, 1, 14597, 14597]) Position ids shape: torch.Size([1, 14597]) Input IDs shape: torch.Size([1, 14597]) Labels shape: torch.Size([1, 14597]) Final batch size: 1, sequence length: 14512 Attention mask shape: torch.Size([1, 1, 14512, 14512]) Position ids shape: torch.Size([1, 14512]) Input IDs shape: torch.Size([1, 14512]) Labels shape: torch.Size([1, 14512]) Final batch size: 1, sequence length: 12967 Attention mask shape: torch.Size([1, 1, 12967, 12967]) Position ids shape: torch.Size([1, 12967]) Input IDs shape: torch.Size([1, 12967]) Labels shape: torch.Size([1, 12967]) Final batch size: 1, sequence length: 16014 Attention mask shape: torch.Size([1, 1, 16014, 16014]) Position ids shape: torch.Size([1, 16014]) Input IDs shape: torch.Size([1, 16014]) Labels shape: torch.Size([1, 16014]) Final batch size: 1, sequence length: 17166 Attention mask shape: torch.Size([1, 1, 17166, 17166]) Position ids shape: torch.Size([1, 17166]) Input IDs shape: torch.Size([1, 17166]) Labels shape: torch.Size([1, 17166]) Final batch size: 1, sequence length: 18050 Attention mask shape: torch.Size([1, 1, 18050, 18050]) Position ids shape: torch.Size([1, 18050]) Input IDs shape: torch.Size([1, 18050]) Labels shape: torch.Size([1, 18050]) Final batch size: 1, sequence length: 15438 Attention mask shape: torch.Size([1, 1, 15438, 15438]) Position ids shape: torch.Size([1, 15438]) Input IDs shape: torch.Size([1, 15438]) Labels shape: torch.Size([1, 15438]) Final batch size: 1, sequence length: 11206 Attention mask shape: torch.Size([1, 1, 11206, 11206]) Position ids shape: torch.Size([1, 11206]) Input IDs shape: torch.Size([1, 11206]) Labels shape: torch.Size([1, 11206]) Final batch size: 1, sequence length: 17092 Attention mask shape: torch.Size([1, 1, 17092, 17092]) Position ids shape: torch.Size([1, 17092]) Input IDs shape: torch.Size([1, 17092]) Labels shape: torch.Size([1, 17092]) Final batch size: 1, sequence length: 18131 Attention mask shape: torch.Size([1, 1, 18131, 18131]) Position ids shape: torch.Size([1, 18131]) Input IDs shape: torch.Size([1, 18131]) Labels shape: torch.Size([1, 18131]) Final batch size: 1, sequence length: 17839 Attention mask shape: torch.Size([1, 1, 17839, 17839]) Position ids shape: torch.Size([1, 17839]) Input IDs shape: torch.Size([1, 17839]) Labels shape: torch.Size([1, 17839]) Final batch size: 1, sequence length: 18264 Attention mask shape: torch.Size([1, 1, 18264, 18264]) Position ids shape: torch.Size([1, 18264]) Input IDs shape: torch.Size([1, 18264]) Labels shape: torch.Size([1, 18264]) Final batch size: 1, sequence length: 20947 Attention mask shape: torch.Size([1, 1, 20947, 20947]) Position ids shape: torch.Size([1, 20947]) Input IDs shape: torch.Size([1, 20947]) Labels shape: torch.Size([1, 20947]) Final batch size: 1, sequence length: 14520 Attention mask shape: torch.Size([1, 1, 14520, 14520]) Position ids shape: torch.Size([1, 14520]) Input IDs shape: torch.Size([1, 14520]) Labels shape: torch.Size([1, 14520]) Final batch size: 1, sequence length: 14335 Attention mask shape: torch.Size([1, 1, 14335, 14335]) Position ids shape: torch.Size([1, 14335]) Input IDs shape: torch.Size([1, 14335]) Labels shape: torch.Size([1, 14335]) Final batch size: 1, sequence length: 22107 Attention mask shape: torch.Size([1, 1, 22107, 22107]) Position ids shape: torch.Size([1, 22107]) Input IDs shape: torch.Size([1, 22107]) Labels shape: torch.Size([1, 22107]) Final batch size: 1, sequence length: 20487 Attention mask shape: torch.Size([1, 1, 20487, 20487]) Position ids shape: torch.Size([1, 20487]) Input IDs shape: torch.Size([1, 20487]) Labels shape: torch.Size([1, 20487]) Final batch size: 1, sequence length: 13061 Attention mask shape: torch.Size([1, 1, 13061, 13061]) Position ids shape: torch.Size([1, 13061]) Input IDs shape: torch.Size([1, 13061]) Labels shape: torch.Size([1, 13061]) Final batch size: 1, sequence length: 18950 Attention mask shape: torch.Size([1, 1, 18950, 18950]) Position ids shape: torch.Size([1, 18950]) Input IDs shape: torch.Size([1, 18950]) Labels shape: torch.Size([1, 18950]) Final batch size: 1, sequence length: 22547 Attention mask shape: torch.Size([1, 1, 22547, 22547]) Position ids shape: torch.Size([1, 22547]) Input IDs shape: torch.Size([1, 22547]) Labels shape: torch.Size([1, 22547]) Final batch size: 1, sequence length: 24432 Attention mask shape: torch.Size([1, 1, 24432, 24432]) Position ids shape: torch.Size([1, 24432]) Input IDs shape: torch.Size([1, 24432]) Labels shape: torch.Size([1, 24432]) Final batch size: 1, sequence length: 17194 Attention mask shape: torch.Size([1, 1, 17194, 17194]) Position ids shape: torch.Size([1, 17194]) Input IDs shape: torch.Size([1, 17194]) Labels shape: torch.Size([1, 17194]) Final batch size: 1, sequence length: 21137 Attention mask shape: torch.Size([1, 1, 21137, 21137]) Position ids shape: torch.Size([1, 21137]) Input IDs shape: torch.Size([1, 21137]) Labels shape: torch.Size([1, 21137]) Final batch size: 1, sequence length: 24566 Attention mask shape: torch.Size([1, 1, 24566, 24566]) Position ids shape: torch.Size([1, 24566]) Input IDs shape: torch.Size([1, 24566]) Labels shape: torch.Size([1, 24566]) Final batch size: 1, sequence length: 26068 Attention mask shape: torch.Size([1, 1, 26068, 26068]) Position ids shape: torch.Size([1, 26068]) Input IDs shape: torch.Size([1, 26068]) Labels shape: torch.Size([1, 26068]) Final batch size: 1, sequence length: 23324 Attention mask shape: torch.Size([1, 1, 23324, 23324]) Position ids shape: torch.Size([1, 23324]) Input IDs shape: torch.Size([1, 23324]) Labels shape: torch.Size([1, 23324]) Final batch size: 1, sequence length: 23960 Attention mask shape: torch.Size([1, 1, 23960, 23960]) Position ids shape: torch.Size([1, 23960]) Input IDs shape: torch.Size([1, 23960]) Labels shape: torch.Size([1, 23960]) Final batch size: 1, sequence length: 17115 Attention mask shape: torch.Size([1, 1, 17115, 17115]) Position ids shape: torch.Size([1, 17115]) Input IDs shape: torch.Size([1, 17115]) Labels shape: torch.Size([1, 17115]) Final batch size: 1, sequence length: 26333 Attention mask shape: torch.Size([1, 1, 26333, 26333]) Position ids shape: torch.Size([1, 26333]) Input IDs shape: torch.Size([1, 26333]) Labels shape: torch.Size([1, 26333]) Final batch size: 1, sequence length: 10286 Attention mask shape: torch.Size([1, 1, 10286, 10286]) Position ids shape: torch.Size([1, 10286]) Input IDs shape: torch.Size([1, 10286]) Labels shape: torch.Size([1, 10286]) Final batch size: 1, sequence length: 15229 Attention mask shape: torch.Size([1, 1, 15229, 15229]) Position ids shape: torch.Size([1, 15229]) Input IDs shape: torch.Size([1, 15229]) Labels shape: torch.Size([1, 15229]) Final batch size: 1, sequence length: 25252 Attention mask shape: torch.Size([1, 1, 25252, 25252]) Position ids shape: torch.Size([1, 25252]) Input IDs shape: torch.Size([1, 25252]) Labels shape: torch.Size([1, 25252]) Final batch size: 1, sequence length: 27278 Attention mask shape: torch.Size([1, 1, 27278, 27278]) Position ids shape: torch.Size([1, 27278]) Input IDs shape: torch.Size([1, 27278]) Labels shape: torch.Size([1, 27278]) Final batch size: 1, sequence length: 18051 Attention mask shape: torch.Size([1, 1, 18051, 18051]) Position ids shape: torch.Size([1, 18051]) Input IDs shape: torch.Size([1, 18051]) Labels shape: torch.Size([1, 18051]) Final batch size: 1, sequence length: 26937 Attention mask shape: torch.Size([1, 1, 26937, 26937]) Position ids shape: torch.Size([1, 26937]) Input IDs shape: torch.Size([1, 26937]) Labels shape: torch.Size([1, 26937]) Final batch size: 1, sequence length: 26271 Attention mask shape: torch.Size([1, 1, 26271, 26271]) Position ids shape: torch.Size([1, 26271]) Input IDs shape: torch.Size([1, 26271]) Labels shape: torch.Size([1, 26271]) Final batch size: 1, sequence length: 21728 Attention mask shape: torch.Size([1, 1, 21728, 21728]) Position ids shape: torch.Size([1, 21728]) Input IDs shape: torch.Size([1, 21728]) Labels shape: torch.Size([1, 21728]) Final batch size: 1, sequence length: 28284 Attention mask shape: torch.Size([1, 1, 28284, 28284]) Position ids shape: torch.Size([1, 28284]) Input IDs shape: torch.Size([1, 28284]) Labels shape: torch.Size([1, 28284]) Final batch size: 1, sequence length: 29592 Attention mask shape: torch.Size([1, 1, 29592, 29592]) Position ids shape: torch.Size([1, 29592]) Input IDs shape: torch.Size([1, 29592]) Labels shape: torch.Size([1, 29592]) Final batch size: 1, sequence length: 29478 Attention mask shape: torch.Size([1, 1, 29478, 29478]) Position ids shape: torch.Size([1, 29478]) Input IDs shape: torch.Size([1, 29478]) Labels shape: torch.Size([1, 29478]) Final batch size: 1, sequence length: 29236 Attention mask shape: torch.Size([1, 1, 29236, 29236]) Position ids shape: torch.Size([1, 29236]) Input IDs shape: torch.Size([1, 29236]) Labels shape: torch.Size([1, 29236]) Final batch size: 1, sequence length: 31377 Attention mask shape: torch.Size([1, 1, 31377, 31377]) Position ids shape: torch.Size([1, 31377]) Input IDs shape: torch.Size([1, 31377]) Labels shape: torch.Size([1, 31377]) Final batch size: 1, sequence length: 19888 Attention mask shape: torch.Size([1, 1, 19888, 19888]) Position ids shape: torch.Size([1, 19888]) Input IDs shape: torch.Size([1, 19888]) Labels shape: torch.Size([1, 19888]) Final batch size: 1, sequence length: 29768 Attention mask shape: torch.Size([1, 1, 29768, 29768]) Position ids shape: torch.Size([1, 29768]) Input IDs shape: torch.Size([1, 29768]) Labels shape: torch.Size([1, 29768]) Final batch size: 1, sequence length: 27293 Attention mask shape: torch.Size([1, 1, 27293, 27293]) Position ids shape: torch.Size([1, 27293]) Input IDs shape: torch.Size([1, 27293]) Labels shape: torch.Size([1, 27293]) Final batch size: 1, sequence length: 28858 Attention mask shape: torch.Size([1, 1, 28858, 28858]) Position ids shape: torch.Size([1, 28858]) Input IDs shape: torch.Size([1, 28858]) Labels shape: torch.Size([1, 28858]) Final batch size: 1, sequence length: 7364 Attention mask shape: torch.Size([1, 1, 7364, 7364]) Position ids shape: torch.Size([1, 7364]) Input IDs shape: torch.Size([1, 7364]) Labels shape: torch.Size([1, 7364]) Final batch size: 1, sequence length: 30862 Attention mask shape: torch.Size([1, 1, 30862, 30862]) Position ids shape: torch.Size([1, 30862]) Input IDs shape: torch.Size([1, 30862]) Labels shape: torch.Size([1, 30862]) Final batch size: 1, sequence length: 32392 Attention mask shape: torch.Size([1, 1, 32392, 32392]) Position ids shape: torch.Size([1, 32392]) Input IDs shape: torch.Size([1, 32392]) Labels shape: torch.Size([1, 32392]) Final batch size: 1, sequence length: 28421 Attention mask shape: torch.Size([1, 1, 28421, 28421]) Position ids shape: torch.Size([1, 28421]) Input IDs shape: torch.Size([1, 28421]) Labels shape: torch.Size([1, 28421]) Final batch size: 1, sequence length: 13257 Attention mask shape: torch.Size([1, 1, 13257, 13257]) Position ids shape: torch.Size([1, 13257]) Input IDs shape: torch.Size([1, 13257]) Labels shape: torch.Size([1, 13257]) Final batch size: 1, sequence length: 31860 Attention mask shape: torch.Size([1, 1, 31860, 31860]) Position ids shape: torch.Size([1, 31860]) Input IDs shape: torch.Size([1, 31860]) Labels shape: torch.Size([1, 31860]) Final batch size: 1, sequence length: 32112 Attention mask shape: torch.Size([1, 1, 32112, 32112]) Position ids shape: torch.Size([1, 32112]) Input IDs shape: torch.Size([1, 32112]) Labels shape: torch.Size([1, 32112]) Final batch size: 1, sequence length: 18408 Attention mask shape: torch.Size([1, 1, 18408, 18408]) Position ids shape: torch.Size([1, 18408]) Input IDs shape: torch.Size([1, 18408]) Labels shape: torch.Size([1, 18408]) Final batch size: 1, sequence length: 33736 Attention mask shape: torch.Size([1, 1, 33736, 33736]) Position ids shape: torch.Size([1, 33736]) Input IDs shape: torch.Size([1, 33736]) Labels shape: torch.Size([1, 33736]) Final batch size: 1, sequence length: 33097 Attention mask shape: torch.Size([1, 1, 33097, 33097]) Position ids shape: torch.Size([1, 33097]) Input IDs shape: torch.Size([1, 33097]) Labels shape: torch.Size([1, 33097]) Final batch size: 1, sequence length: 35171 Attention mask shape: torch.Size([1, 1, 35171, 35171]) Position ids shape: torch.Size([1, 35171]) Input IDs shape: torch.Size([1, 35171]) Labels shape: torch.Size([1, 35171]) Final batch size: 1, sequence length: 31740 Attention mask shape: torch.Size([1, 1, 31740, 31740]) Position ids shape: torch.Size([1, 31740]) Input IDs shape: torch.Size([1, 31740]) Labels shape: torch.Size([1, 31740]) Final batch size: 1, sequence length: 32181 Attention mask shape: torch.Size([1, 1, 32181, 32181]) Position ids shape: torch.Size([1, 32181]) Input IDs shape: torch.Size([1, 32181]) Labels shape: torch.Size([1, 32181]) Final batch size: 1, sequence length: 29489 Attention mask shape: torch.Size([1, 1, 29489, 29489]) Position ids shape: torch.Size([1, 29489]) Input IDs shape: torch.Size([1, 29489]) Labels shape: torch.Size([1, 29489]) Final batch size: 1, sequence length: 35596 Attention mask shape: torch.Size([1, 1, 35596, 35596]) Position ids shape: torch.Size([1, 35596]) Input IDs shape: torch.Size([1, 35596]) Labels shape: torch.Size([1, 35596]) Final batch size: 1, sequence length: 9974 Attention mask shape: torch.Size([1, 1, 9974, 9974]) Position ids shape: torch.Size([1, 9974]) Input IDs shape: torch.Size([1, 9974]) Labels shape: torch.Size([1, 9974]) Final batch size: 1, sequence length: 32162 Attention mask shape: torch.Size([1, 1, 32162, 32162]) Position ids shape: torch.Size([1, 32162]) Input IDs shape: torch.Size([1, 32162]) Labels shape: torch.Size([1, 32162]) Final batch size: 1, sequence length: 37623 Attention mask shape: torch.Size([1, 1, 37623, 37623]) Position ids shape: torch.Size([1, 37623]) Input IDs shape: torch.Size([1, 37623]) Labels shape: torch.Size([1, 37623]) Final batch size: 1, sequence length: 18711 Attention mask shape: torch.Size([1, 1, 18711, 18711]) Position ids shape: torch.Size([1, 18711]) Input IDs shape: torch.Size([1, 18711]) Labels shape: torch.Size([1, 18711]) Final batch size: 1, sequence length: 25484 Attention mask shape: torch.Size([1, 1, 25484, 25484]) Position ids shape: torch.Size([1, 25484]) Input IDs shape: torch.Size([1, 25484]) Labels shape: torch.Size([1, 25484]) Final batch size: 1, sequence length: 30302 Attention mask shape: torch.Size([1, 1, 30302, 30302]) Position ids shape: torch.Size([1, 30302]) Input IDs shape: torch.Size([1, 30302]) Labels shape: torch.Size([1, 30302]) Final batch size: 1, sequence length: 34514 Attention mask shape: torch.Size([1, 1, 34514, 34514]) Position ids shape: torch.Size([1, 34514]) Input IDs shape: torch.Size([1, 34514]) Labels shape: torch.Size([1, 34514]) Final batch size: 1, sequence length: 32979 Attention mask shape: torch.Size([1, 1, 32979, 32979]) Position ids shape: torch.Size([1, 32979]) Input IDs shape: torch.Size([1, 32979]) Labels shape: torch.Size([1, 32979]) Final batch size: 1, sequence length: 35069 Attention mask shape: torch.Size([1, 1, 35069, 35069]) Position ids shape: torch.Size([1, 35069]) Input IDs shape: torch.Size([1, 35069]) Labels shape: torch.Size([1, 35069]) Final batch size: 1, sequence length: 38790 Attention mask shape: torch.Size([1, 1, 38790, 38790]) Position ids shape: torch.Size([1, 38790]) Input IDs shape: torch.Size([1, 38790]) Labels shape: torch.Size([1, 38790]) Final batch size: 1, sequence length: 39754 Attention mask shape: torch.Size([1, 1, 39754, 39754]) Position ids shape: torch.Size([1, 39754]) Input IDs shape: torch.Size([1, 39754]) Labels shape: torch.Size([1, 39754]) Final batch size: 1, sequence length: 21500 Attention mask shape: torch.Size([1, 1, 21500, 21500]) Position ids shape: torch.Size([1, 21500]) Input IDs shape: torch.Size([1, 21500]) Labels shape: torch.Size([1, 21500]) Final batch size: 1, sequence length: 13557 Attention mask shape: torch.Size([1, 1, 13557, 13557]) Position ids shape: torch.Size([1, 13557]) Input IDs shape: torch.Size([1, 13557]) Labels shape: torch.Size([1, 13557]) Final batch size: 1, sequence length: 30051 Attention mask shape: torch.Size([1, 1, 30051, 30051]) Position ids shape: torch.Size([1, 30051]) Input IDs shape: torch.Size([1, 30051]) Labels shape: torch.Size([1, 30051]) Final batch size: 1, sequence length: 17777 Attention mask shape: torch.Size([1, 1, 17777, 17777]) Position ids shape: torch.Size([1, 17777]) Input IDs shape: torch.Size([1, 17777]) Labels shape: torch.Size([1, 17777]) Final batch size: 1, sequence length: 21408 Attention mask shape: torch.Size([1, 1, 21408, 21408]) Position ids shape: torch.Size([1, 21408]) Input IDs shape: torch.Size([1, 21408]) Labels shape: torch.Size([1, 21408]) Final batch size: 1, sequence length: 38351 Attention mask shape: torch.Size([1, 1, 38351, 38351]) Position ids shape: torch.Size([1, 38351]) Input IDs shape: torch.Size([1, 38351]) Labels shape: torch.Size([1, 38351]) Final batch size: 1, sequence length: 34871 Attention mask shape: torch.Size([1, 1, 34871, 34871]) Position ids shape: torch.Size([1, 34871]) Input IDs shape: torch.Size([1, 34871]) Labels shape: torch.Size([1, 34871]) Final batch size: 1, sequence length: 32327 Attention mask shape: torch.Size([1, 1, 32327, 32327]) Position ids shape: torch.Size([1, 32327]) Input IDs shape: torch.Size([1, 32327]) Labels shape: torch.Size([1, 32327]) Final batch size: 1, sequence length: 24061 Attention mask shape: torch.Size([1, 1, 24061, 24061]) Position ids shape: torch.Size([1, 24061]) Input IDs shape: torch.Size([1, 24061]) Labels shape: torch.Size([1, 24061]) Final batch size: 1, sequence length: 30639 Attention mask shape: torch.Size([1, 1, 30639, 30639]) Position ids shape: torch.Size([1, 30639]) Input IDs shape: torch.Size([1, 30639]) Labels shape: torch.Size([1, 30639]) Final batch size: 1, sequence length: 36535 Attention mask shape: torch.Size([1, 1, 36535, 36535]) Position ids shape: torch.Size([1, 36535]) Input IDs shape: torch.Size([1, 36535]) Labels shape: torch.Size([1, 36535]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 31679 Attention mask shape: torch.Size([1, 1, 31679, 31679]) Position ids shape: torch.Size([1, 31679]) Input IDs shape: torch.Size([1, 31679]) Labels shape: torch.Size([1, 31679]) Final batch size: 1, sequence length: 34170 Attention mask shape: torch.Size([1, 1, 34170, 34170]) Position ids shape: torch.Size([1, 34170]) Input IDs shape: torch.Size([1, 34170]) Labels shape: torch.Size([1, 34170]) Final batch size: 1, sequence length: 25386 Attention mask shape: torch.Size([1, 1, 25386, 25386]) Position ids shape: torch.Size([1, 25386]) Input IDs shape: torch.Size([1, 25386]) Labels shape: torch.Size([1, 25386]) Final batch size: 1, sequence length: 24943 Attention mask shape: torch.Size([1, 1, 24943, 24943]) Position ids shape: torch.Size([1, 24943]) Input IDs shape: torch.Size([1, 24943]) Labels shape: torch.Size([1, 24943]) Final batch size: 1, sequence length: 23243 Attention mask shape: torch.Size([1, 1, 23243, 23243]) Position ids shape: torch.Size([1, 23243]) Input IDs shape: torch.Size([1, 23243]) Labels shape: torch.Size([1, 23243]) Final batch size: 1, sequence length: 15697 Attention mask shape: torch.Size([1, 1, 15697, 15697]) Position ids shape: torch.Size([1, 15697]) Input IDs shape: torch.Size([1, 15697]) Labels shape: torch.Size([1, 15697]) Final batch size: 1, sequence length: 33807 Attention mask shape: torch.Size([1, 1, 33807, 33807]) Position ids shape: torch.Size([1, 33807]) Input IDs shape: torch.Size([1, 33807]) Labels shape: torch.Size([1, 33807]) Final batch size: 1, sequence length: 31513 Attention mask shape: torch.Size([1, 1, 31513, 31513]) Position ids shape: torch.Size([1, 31513]) Input IDs shape: torch.Size([1, 31513]) Labels shape: torch.Size([1, 31513]) Final batch size: 1, sequence length: 21825 Attention mask shape: torch.Size([1, 1, 21825, 21825]) Position ids shape: torch.Size([1, 21825]) Input IDs shape: torch.Size([1, 21825]) Labels shape: torch.Size([1, 21825]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 24910 Attention mask shape: torch.Size([1, 1, 24910, 24910]) Position ids shape: torch.Size([1, 24910]) Input IDs shape: torch.Size([1, 24910]) Labels shape: torch.Size([1, 24910]) Final batch size: 1, sequence length: 24840 Attention mask shape: torch.Size([1, 1, 24840, 24840]) Position ids shape: torch.Size([1, 24840]) Input IDs shape: torch.Size([1, 24840]) Labels shape: torch.Size([1, 24840]) Final batch size: 1, sequence length: 18106 Attention mask shape: torch.Size([1, 1, 18106, 18106]) Position ids shape: torch.Size([1, 18106]) Input IDs shape: torch.Size([1, 18106]) Labels shape: torch.Size([1, 18106]) Final batch size: 1, sequence length: 33883 Attention mask shape: torch.Size([1, 1, 33883, 33883]) Position ids shape: torch.Size([1, 33883]) Input IDs shape: torch.Size([1, 33883]) Labels shape: torch.Size([1, 33883]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 30361 Attention mask shape: torch.Size([1, 1, 30361, 30361]) Position ids shape: torch.Size([1, 30361]) Input IDs shape: torch.Size([1, 30361]) Labels shape: torch.Size([1, 30361]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 15180 Attention mask shape: torch.Size([1, 1, 15180, 15180]) Position ids shape: torch.Size([1, 15180]) Input IDs shape: torch.Size([1, 15180]) Labels shape: torch.Size([1, 15180]) Final batch size: 1, sequence length: 29691 Attention mask shape: torch.Size([1, 1, 29691, 29691]) Position ids shape: torch.Size([1, 29691]) Input IDs shape: torch.Size([1, 29691]) Labels shape: torch.Size([1, 29691]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 39483 Attention mask shape: torch.Size([1, 1, 39483, 39483]) Position ids shape: torch.Size([1, 39483]) Input IDs shape: torch.Size([1, 39483]) Labels shape: torch.Size([1, 39483]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 27228 Attention mask shape: torch.Size([1, 1, 27228, 27228]) Position ids shape: torch.Size([1, 27228]) Input IDs shape: torch.Size([1, 27228]) Labels shape: torch.Size([1, 27228]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 30419 Attention mask shape: torch.Size([1, 1, 30419, 30419]) Position ids shape: torch.Size([1, 30419]) Input IDs shape: torch.Size([1, 30419]) Labels shape: torch.Size([1, 30419]) Final batch size: 1, sequence length: 32351 Attention mask shape: torch.Size([1, 1, 32351, 32351]) Position ids shape: torch.Size([1, 32351]) Input IDs shape: torch.Size([1, 32351]) Labels shape: torch.Size([1, 32351]) Final batch size: 1, sequence length: 33549 Attention mask shape: torch.Size([1, 1, 33549, 33549]) Position ids shape: torch.Size([1, 33549]) Input IDs shape: torch.Size([1, 33549]) Labels shape: torch.Size([1, 33549]) Final batch size: 1, sequence length: 13064 Attention mask shape: torch.Size([1, 1, 13064, 13064]) Position ids shape: torch.Size([1, 13064]) Input IDs shape: torch.Size([1, 13064]) Labels shape: torch.Size([1, 13064]) Final batch size: 1, sequence length: 36532 Attention mask shape: torch.Size([1, 1, 36532, 36532]) Position ids shape: torch.Size([1, 36532]) Input IDs shape: torch.Size([1, 36532]) Labels shape: torch.Size([1, 36532]) {'loss': 0.2492, 'grad_norm': 0.1531753558317694, 'learning_rate': 6.15582970243117e-08, 'num_tokens': -inf, 'epoch': 7.75} Final batch size: 1, sequence length: 8845 Attention mask shape: torch.Size([1, 1, 8845, 8845]) Position ids shape: torch.Size([1, 8845]) Input IDs shape: torch.Size([1, 8845]) Labels shape: torch.Size([1, 8845]) Final batch size: 1, sequence length: 7235 Attention mask shape: torch.Size([1, 1, 7235, 7235]) Position ids shape: torch.Size([1, 7235]) Input IDs shape: torch.Size([1, 7235]) Labels shape: torch.Size([1, 7235]) Final batch size: 1, sequence length: 12215 Attention mask shape: torch.Size([1, 1, 12215, 12215]) Position ids shape: torch.Size([1, 12215]) Input IDs shape: torch.Size([1, 12215]) Labels shape: torch.Size([1, 12215]) Final batch size: 1, sequence length: 12830 Attention mask shape: torch.Size([1, 1, 12830, 12830]) Position ids shape: torch.Size([1, 12830]) Input IDs shape: torch.Size([1, 12830]) Labels shape: torch.Size([1, 12830]) Final batch size: 1, sequence length: 13575 Attention mask shape: torch.Size([1, 1, 13575, 13575]) Position ids shape: torch.Size([1, 13575]) Input IDs shape: torch.Size([1, 13575]) Labels shape: torch.Size([1, 13575]) Final batch size: 1, sequence length: 12960 Attention mask shape: torch.Size([1, 1, 12960, 12960]) Position ids shape: torch.Size([1, 12960]) Input IDs shape: torch.Size([1, 12960]) Labels shape: torch.Size([1, 12960]) Final batch size: 1, sequence length: 14689 Attention mask shape: torch.Size([1, 1, 14689, 14689]) Position ids shape: torch.Size([1, 14689]) Input IDs shape: torch.Size([1, 14689]) Labels shape: torch.Size([1, 14689]) Final batch size: 1, sequence length: 12622 Attention mask shape: torch.Size([1, 1, 12622, 12622]) Position ids shape: torch.Size([1, 12622]) Input IDs shape: torch.Size([1, 12622]) Labels shape: torch.Size([1, 12622]) Final batch size: 1, sequence length: 13665 Attention mask shape: torch.Size([1, 1, 13665, 13665]) Position ids shape: torch.Size([1, 13665]) Input IDs shape: torch.Size([1, 13665]) Labels shape: torch.Size([1, 13665]) Final batch size: 1, sequence length: 14833 Attention mask shape: torch.Size([1, 1, 14833, 14833]) Position ids shape: torch.Size([1, 14833]) Input IDs shape: torch.Size([1, 14833]) Labels shape: torch.Size([1, 14833]) Final batch size: 1, sequence length: 17016 Attention mask shape: torch.Size([1, 1, 17016, 17016]) Position ids shape: torch.Size([1, 17016]) Input IDs shape: torch.Size([1, 17016]) Labels shape: torch.Size([1, 17016]) Final batch size: 1, sequence length: 16145 Attention mask shape: torch.Size([1, 1, 16145, 16145]) Position ids shape: torch.Size([1, 16145]) Input IDs shape: torch.Size([1, 16145]) Labels shape: torch.Size([1, 16145]) Final batch size: 1, sequence length: 17026 Attention mask shape: torch.Size([1, 1, 17026, 17026]) Position ids shape: torch.Size([1, 17026]) Input IDs shape: torch.Size([1, 17026]) Labels shape: torch.Size([1, 17026]) Final batch size: 1, sequence length: 12562 Attention mask shape: torch.Size([1, 1, 12562, 12562]) Position ids shape: torch.Size([1, 12562]) Input IDs shape: torch.Size([1, 12562]) Labels shape: torch.Size([1, 12562]) Final batch size: 1, sequence length: 14482 Attention mask shape: torch.Size([1, 1, 14482, 14482]) Position ids shape: torch.Size([1, 14482]) Input IDs shape: torch.Size([1, 14482]) Labels shape: torch.Size([1, 14482]) Final batch size: 1, sequence length: 15673 Attention mask shape: torch.Size([1, 1, 15673, 15673]) Position ids shape: torch.Size([1, 15673]) Input IDs shape: torch.Size([1, 15673]) Labels shape: torch.Size([1, 15673]) Final batch size: 1, sequence length: 15816 Attention mask shape: torch.Size([1, 1, 15816, 15816]) Position ids shape: torch.Size([1, 15816]) Input IDs shape: torch.Size([1, 15816]) Labels shape: torch.Size([1, 15816]) Final batch size: 1, sequence length: 17988 Attention mask shape: torch.Size([1, 1, 17988, 17988]) Position ids shape: torch.Size([1, 17988]) Input IDs shape: torch.Size([1, 17988]) Labels shape: torch.Size([1, 17988]) Final batch size: 1, sequence length: 18978 Attention mask shape: torch.Size([1, 1, 18978, 18978]) Position ids shape: torch.Size([1, 18978]) Input IDs shape: torch.Size([1, 18978]) Labels shape: torch.Size([1, 18978]) Final batch size: 1, sequence length: 17512 Attention mask shape: torch.Size([1, 1, 17512, 17512]) Position ids shape: torch.Size([1, 17512]) Input IDs shape: torch.Size([1, 17512]) Labels shape: torch.Size([1, 17512]) Final batch size: 1, sequence length: 18922 Attention mask shape: torch.Size([1, 1, 18922, 18922]) Position ids shape: torch.Size([1, 18922]) Input IDs shape: torch.Size([1, 18922]) Labels shape: torch.Size([1, 18922]) Final batch size: 1, sequence length: 20784 Attention mask shape: torch.Size([1, 1, 20784, 20784]) Position ids shape: torch.Size([1, 20784]) Input IDs shape: torch.Size([1, 20784]) Labels shape: torch.Size([1, 20784]) Final batch size: 1, sequence length: 19225 Attention mask shape: torch.Size([1, 1, 19225, 19225]) Position ids shape: torch.Size([1, 19225]) Input IDs shape: torch.Size([1, 19225]) Labels shape: torch.Size([1, 19225]) Final batch size: 1, sequence length: 20630 Attention mask shape: torch.Size([1, 1, 20630, 20630]) Position ids shape: torch.Size([1, 20630]) Input IDs shape: torch.Size([1, 20630]) Labels shape: torch.Size([1, 20630]) Final batch size: 1, sequence length: 20991 Attention mask shape: torch.Size([1, 1, 20991, 20991]) Position ids shape: torch.Size([1, 20991]) Input IDs shape: torch.Size([1, 20991]) Labels shape: torch.Size([1, 20991]) Final batch size: 1, sequence length: 21075 Attention mask shape: torch.Size([1, 1, 21075, 21075]) Position ids shape: torch.Size([1, 21075]) Input IDs shape: torch.Size([1, 21075]) Labels shape: torch.Size([1, 21075]) Final batch size: 1, sequence length: 20389 Attention mask shape: torch.Size([1, 1, 20389, 20389]) Position ids shape: torch.Size([1, 20389]) Input IDs shape: torch.Size([1, 20389]) Labels shape: torch.Size([1, 20389]) Final batch size: 1, sequence length: 21598 Attention mask shape: torch.Size([1, 1, 21598, 21598]) Position ids shape: torch.Size([1, 21598]) Input IDs shape: torch.Size([1, 21598]) Labels shape: torch.Size([1, 21598]) Final batch size: 1, sequence length: 19409 Attention mask shape: torch.Size([1, 1, 19409, 19409]) Position ids shape: torch.Size([1, 19409]) Input IDs shape: torch.Size([1, 19409]) Labels shape: torch.Size([1, 19409]) Final batch size: 1, sequence length: 12969 Attention mask shape: torch.Size([1, 1, 12969, 12969]) Position ids shape: torch.Size([1, 12969]) Input IDs shape: torch.Size([1, 12969]) Labels shape: torch.Size([1, 12969]) Final batch size: 1, sequence length: 14577 Attention mask shape: torch.Size([1, 1, 14577, 14577]) Position ids shape: torch.Size([1, 14577]) Input IDs shape: torch.Size([1, 14577]) Labels shape: torch.Size([1, 14577]) Final batch size: 1, sequence length: 10494 Attention mask shape: torch.Size([1, 1, 10494, 10494]) Position ids shape: torch.Size([1, 10494]) Input IDs shape: torch.Size([1, 10494]) Labels shape: torch.Size([1, 10494]) Final batch size: 1, sequence length: 20585 Attention mask shape: torch.Size([1, 1, 20585, 20585]) Position ids shape: torch.Size([1, 20585]) Input IDs shape: torch.Size([1, 20585]) Labels shape: torch.Size([1, 20585]) Final batch size: 1, sequence length: 22949 Attention mask shape: torch.Size([1, 1, 22949, 22949]) Position ids shape: torch.Size([1, 22949]) Input IDs shape: torch.Size([1, 22949]) Labels shape: torch.Size([1, 22949]) Final batch size: 1, sequence length: 6618 Attention mask shape: torch.Size([1, 1, 6618, 6618]) Position ids shape: torch.Size([1, 6618]) Input IDs shape: torch.Size([1, 6618]) Labels shape: torch.Size([1, 6618]) Final batch size: 1, sequence length: 23437 Attention mask shape: torch.Size([1, 1, 23437, 23437]) Position ids shape: torch.Size([1, 23437]) Input IDs shape: torch.Size([1, 23437]) Labels shape: torch.Size([1, 23437]) Final batch size: 1, sequence length: 23755 Attention mask shape: torch.Size([1, 1, 23755, 23755]) Position ids shape: torch.Size([1, 23755]) Input IDs shape: torch.Size([1, 23755]) Labels shape: torch.Size([1, 23755]) Final batch size: 1, sequence length: 21907 Attention mask shape: torch.Size([1, 1, 21907, 21907]) Position ids shape: torch.Size([1, 21907]) Input IDs shape: torch.Size([1, 21907]) Labels shape: torch.Size([1, 21907]) Final batch size: 1, sequence length: 20465 Attention mask shape: torch.Size([1, 1, 20465, 20465]) Position ids shape: torch.Size([1, 20465]) Input IDs shape: torch.Size([1, 20465]) Labels shape: torch.Size([1, 20465]) Final batch size: 1, sequence length: 15222 Attention mask shape: torch.Size([1, 1, 15222, 15222]) Position ids shape: torch.Size([1, 15222]) Input IDs shape: torch.Size([1, 15222]) Labels shape: torch.Size([1, 15222]) Final batch size: 1, sequence length: 14548 Attention mask shape: torch.Size([1, 1, 14548, 14548]) Position ids shape: torch.Size([1, 14548]) Input IDs shape: torch.Size([1, 14548]) Labels shape: torch.Size([1, 14548]) Final batch size: 1, sequence length: 22778 Attention mask shape: torch.Size([1, 1, 22778, 22778]) Position ids shape: torch.Size([1, 22778]) Input IDs shape: torch.Size([1, 22778]) Labels shape: torch.Size([1, 22778]) Final batch size: 1, sequence length: 19325 Attention mask shape: torch.Size([1, 1, 19325, 19325]) Position ids shape: torch.Size([1, 19325]) Input IDs shape: torch.Size([1, 19325]) Labels shape: torch.Size([1, 19325]) Final batch size: 1, sequence length: 23805 Attention mask shape: torch.Size([1, 1, 23805, 23805]) Position ids shape: torch.Size([1, 23805]) Input IDs shape: torch.Size([1, 23805]) Labels shape: torch.Size([1, 23805]) Final batch size: 1, sequence length: 23293 Attention mask shape: torch.Size([1, 1, 23293, 23293]) Position ids shape: torch.Size([1, 23293]) Input IDs shape: torch.Size([1, 23293]) Labels shape: torch.Size([1, 23293]) Final batch size: 1, sequence length: 3010 Attention mask shape: torch.Size([1, 1, 3010, 3010]) Position ids shape: torch.Size([1, 3010]) Input IDs shape: torch.Size([1, 3010]) Labels shape: torch.Size([1, 3010]) Final batch size: 1, sequence length: 19724 Attention mask shape: torch.Size([1, 1, 19724, 19724]) Position ids shape: torch.Size([1, 19724]) Input IDs shape: torch.Size([1, 19724]) Labels shape: torch.Size([1, 19724]) Final batch size: 1, sequence length: 13072 Attention mask shape: torch.Size([1, 1, 13072, 13072]) Position ids shape: torch.Size([1, 13072]) Input IDs shape: torch.Size([1, 13072]) Labels shape: torch.Size([1, 13072]) Final batch size: 1, sequence length: 21322 Attention mask shape: torch.Size([1, 1, 21322, 21322]) Position ids shape: torch.Size([1, 21322]) Input IDs shape: torch.Size([1, 21322]) Labels shape: torch.Size([1, 21322]) Final batch size: 1, sequence length: 24107 Attention mask shape: torch.Size([1, 1, 24107, 24107]) Position ids shape: torch.Size([1, 24107]) Input IDs shape: torch.Size([1, 24107]) Labels shape: torch.Size([1, 24107]) Final batch size: 1, sequence length: 26424 Attention mask shape: torch.Size([1, 1, 26424, 26424]) Position ids shape: torch.Size([1, 26424]) Input IDs shape: torch.Size([1, 26424]) Labels shape: torch.Size([1, 26424]) Final batch size: 1, sequence length: 25934 Attention mask shape: torch.Size([1, 1, 25934, 25934]) Position ids shape: torch.Size([1, 25934]) Input IDs shape: torch.Size([1, 25934]) Labels shape: torch.Size([1, 25934]) Final batch size: 1, sequence length: 14137 Attention mask shape: torch.Size([1, 1, 14137, 14137]) Position ids shape: torch.Size([1, 14137]) Input IDs shape: torch.Size([1, 14137]) Labels shape: torch.Size([1, 14137]) Final batch size: 1, sequence length: 28012 Attention mask shape: torch.Size([1, 1, 28012, 28012]) Position ids shape: torch.Size([1, 28012]) Input IDs shape: torch.Size([1, 28012]) Labels shape: torch.Size([1, 28012]) Final batch size: 1, sequence length: 12234 Attention mask shape: torch.Size([1, 1, 12234, 12234]) Position ids shape: torch.Size([1, 12234]) Input IDs shape: torch.Size([1, 12234]) Labels shape: torch.Size([1, 12234]) Final batch size: 1, sequence length: 23442 Attention mask shape: torch.Size([1, 1, 23442, 23442]) Position ids shape: torch.Size([1, 23442]) Input IDs shape: torch.Size([1, 23442]) Labels shape: torch.Size([1, 23442]) Final batch size: 1, sequence length: 27384 Attention mask shape: torch.Size([1, 1, 27384, 27384]) Position ids shape: torch.Size([1, 27384]) Input IDs shape: torch.Size([1, 27384]) Labels shape: torch.Size([1, 27384]) Final batch size: 1, sequence length: 22851 Attention mask shape: torch.Size([1, 1, 22851, 22851]) Position ids shape: torch.Size([1, 22851]) Input IDs shape: torch.Size([1, 22851]) Labels shape: torch.Size([1, 22851]) Final batch size: 1, sequence length: 21853 Attention mask shape: torch.Size([1, 1, 21853, 21853]) Position ids shape: torch.Size([1, 21853]) Input IDs shape: torch.Size([1, 21853]) Labels shape: torch.Size([1, 21853]) Final batch size: 1, sequence length: 26454 Attention mask shape: torch.Size([1, 1, 26454, 26454]) Position ids shape: torch.Size([1, 26454]) Input IDs shape: torch.Size([1, 26454]) Labels shape: torch.Size([1, 26454]) Final batch size: 1, sequence length: 28474 Attention mask shape: torch.Size([1, 1, 28474, 28474]) Position ids shape: torch.Size([1, 28474]) Input IDs shape: torch.Size([1, 28474]) Labels shape: torch.Size([1, 28474]) Final batch size: 1, sequence length: 22920 Attention mask shape: torch.Size([1, 1, 22920, 22920]) Position ids shape: torch.Size([1, 22920]) Input IDs shape: torch.Size([1, 22920]) Labels shape: torch.Size([1, 22920]) Final batch size: 1, sequence length: 24111 Attention mask shape: torch.Size([1, 1, 24111, 24111]) Position ids shape: torch.Size([1, 24111]) Input IDs shape: torch.Size([1, 24111]) Labels shape: torch.Size([1, 24111]) Final batch size: 1, sequence length: 26500 Attention mask shape: torch.Size([1, 1, 26500, 26500]) Position ids shape: torch.Size([1, 26500]) Input IDs shape: torch.Size([1, 26500]) Labels shape: torch.Size([1, 26500]) Final batch size: 1, sequence length: 24364 Attention mask shape: torch.Size([1, 1, 24364, 24364]) Position ids shape: torch.Size([1, 24364]) Input IDs shape: torch.Size([1, 24364]) Labels shape: torch.Size([1, 24364]) Final batch size: 1, sequence length: 30235 Attention mask shape: torch.Size([1, 1, 30235, 30235]) Position ids shape: torch.Size([1, 30235]) Input IDs shape: torch.Size([1, 30235]) Labels shape: torch.Size([1, 30235]) Final batch size: 1, sequence length: 24716 Attention mask shape: torch.Size([1, 1, 24716, 24716]) Position ids shape: torch.Size([1, 24716]) Input IDs shape: torch.Size([1, 24716]) Labels shape: torch.Size([1, 24716]) Final batch size: 1, sequence length: 20314 Attention mask shape: torch.Size([1, 1, 20314, 20314]) Position ids shape: torch.Size([1, 20314]) Input IDs shape: torch.Size([1, 20314]) Labels shape: torch.Size([1, 20314]) Final batch size: 1, sequence length: 19027 Attention mask shape: torch.Size([1, 1, 19027, 19027]) Position ids shape: torch.Size([1, 19027]) Input IDs shape: torch.Size([1, 19027]) Labels shape: torch.Size([1, 19027]) Final batch size: 1, sequence length: 27520 Attention mask shape: torch.Size([1, 1, 27520, 27520]) Position ids shape: torch.Size([1, 27520]) Input IDs shape: torch.Size([1, 27520]) Labels shape: torch.Size([1, 27520]) Final batch size: 1, sequence length: 11662 Attention mask shape: torch.Size([1, 1, 11662, 11662]) Position ids shape: torch.Size([1, 11662]) Input IDs shape: torch.Size([1, 11662]) Labels shape: torch.Size([1, 11662]) Final batch size: 1, sequence length: 20028 Attention mask shape: torch.Size([1, 1, 20028, 20028]) Position ids shape: torch.Size([1, 20028]) Input IDs shape: torch.Size([1, 20028]) Labels shape: torch.Size([1, 20028]) Final batch size: 1, sequence length: 29842 Attention mask shape: torch.Size([1, 1, 29842, 29842]) Position ids shape: torch.Size([1, 29842]) Input IDs shape: torch.Size([1, 29842]) Labels shape: torch.Size([1, 29842]) Final batch size: 1, sequence length: 12519 Attention mask shape: torch.Size([1, 1, 12519, 12519]) Position ids shape: torch.Size([1, 12519]) Input IDs shape: torch.Size([1, 12519]) Labels shape: torch.Size([1, 12519]) Final batch size: 1, sequence length: 24830 Attention mask shape: torch.Size([1, 1, 24830, 24830]) Position ids shape: torch.Size([1, 24830]) Input IDs shape: torch.Size([1, 24830]) Labels shape: torch.Size([1, 24830]) Final batch size: 1, sequence length: 23855 Attention mask shape: torch.Size([1, 1, 23855, 23855]) Position ids shape: torch.Size([1, 23855]) Input IDs shape: torch.Size([1, 23855]) Labels shape: torch.Size([1, 23855]) Final batch size: 1, sequence length: 33179 Attention mask shape: torch.Size([1, 1, 33179, 33179]) Position ids shape: torch.Size([1, 33179]) Input IDs shape: torch.Size([1, 33179]) Labels shape: torch.Size([1, 33179]) Final batch size: 1, sequence length: 30041 Attention mask shape: torch.Size([1, 1, 30041, 30041]) Position ids shape: torch.Size([1, 30041]) Input IDs shape: torch.Size([1, 30041]) Labels shape: torch.Size([1, 30041]) Final batch size: 1, sequence length: 24923 Attention mask shape: torch.Size([1, 1, 24923, 24923]) Position ids shape: torch.Size([1, 24923]) Input IDs shape: torch.Size([1, 24923]) Labels shape: torch.Size([1, 24923]) Final batch size: 1, sequence length: 31404 Attention mask shape: torch.Size([1, 1, 31404, 31404]) Position ids shape: torch.Size([1, 31404]) Input IDs shape: torch.Size([1, 31404]) Labels shape: torch.Size([1, 31404]) Final batch size: 1, sequence length: 24187 Attention mask shape: torch.Size([1, 1, 24187, 24187]) Position ids shape: torch.Size([1, 24187]) Input IDs shape: torch.Size([1, 24187]) Labels shape: torch.Size([1, 24187]) Final batch size: 1, sequence length: 33321 Attention mask shape: torch.Size([1, 1, 33321, 33321]) Position ids shape: torch.Size([1, 33321]) Input IDs shape: torch.Size([1, 33321]) Labels shape: torch.Size([1, 33321]) Final batch size: 1, sequence length: 19302 Attention mask shape: torch.Size([1, 1, 19302, 19302]) Position ids shape: torch.Size([1, 19302]) Input IDs shape: torch.Size([1, 19302]) Labels shape: torch.Size([1, 19302]) Final batch size: 1, sequence length: 33765 Attention mask shape: torch.Size([1, 1, 33765, 33765]) Position ids shape: torch.Size([1, 33765]) Input IDs shape: torch.Size([1, 33765]) Labels shape: torch.Size([1, 33765]) Final batch size: 1, sequence length: 21168 Attention mask shape: torch.Size([1, 1, 21168, 21168]) Position ids shape: torch.Size([1, 21168]) Input IDs shape: torch.Size([1, 21168]) Labels shape: torch.Size([1, 21168]) Final batch size: 1, sequence length: 23695 Attention mask shape: torch.Size([1, 1, 23695, 23695]) Position ids shape: torch.Size([1, 23695]) Input IDs shape: torch.Size([1, 23695]) Labels shape: torch.Size([1, 23695]) Final batch size: 1, sequence length: 29743 Attention mask shape: torch.Size([1, 1, 29743, 29743]) Position ids shape: torch.Size([1, 29743]) Input IDs shape: torch.Size([1, 29743]) Labels shape: torch.Size([1, 29743]) Final batch size: 1, sequence length: 35555 Attention mask shape: torch.Size([1, 1, 35555, 35555]) Position ids shape: torch.Size([1, 35555]) Input IDs shape: torch.Size([1, 35555]) Labels shape: torch.Size([1, 35555]) Final batch size: 1, sequence length: 29385 Attention mask shape: torch.Size([1, 1, 29385, 29385]) Position ids shape: torch.Size([1, 29385]) Input IDs shape: torch.Size([1, 29385]) Labels shape: torch.Size([1, 29385]) Final batch size: 1, sequence length: 34169 Attention mask shape: torch.Size([1, 1, 34169, 34169]) Position ids shape: torch.Size([1, 34169]) Input IDs shape: torch.Size([1, 34169]) Labels shape: torch.Size([1, 34169]) Final batch size: 1, sequence length: 27107 Attention mask shape: torch.Size([1, 1, 27107, 27107]) Position ids shape: torch.Size([1, 27107]) Input IDs shape: torch.Size([1, 27107]) Labels shape: torch.Size([1, 27107]) Final batch size: 1, sequence length: 17763 Attention mask shape: torch.Size([1, 1, 17763, 17763]) Position ids shape: torch.Size([1, 17763]) Input IDs shape: torch.Size([1, 17763]) Labels shape: torch.Size([1, 17763]) Final batch size: 1, sequence length: 27738 Attention mask shape: torch.Size([1, 1, 27738, 27738]) Position ids shape: torch.Size([1, 27738]) Input IDs shape: torch.Size([1, 27738]) Labels shape: torch.Size([1, 27738]) Final batch size: 1, sequence length: 37657 Attention mask shape: torch.Size([1, 1, 37657, 37657]) Position ids shape: torch.Size([1, 37657]) Input IDs shape: torch.Size([1, 37657]) Labels shape: torch.Size([1, 37657]) Final batch size: 1, sequence length: 38576 Attention mask shape: torch.Size([1, 1, 38576, 38576]) Position ids shape: torch.Size([1, 38576]) Input IDs shape: torch.Size([1, 38576]) Labels shape: torch.Size([1, 38576]) Final batch size: 1, sequence length: 10139 Attention mask shape: torch.Size([1, 1, 10139, 10139]) Position ids shape: torch.Size([1, 10139]) Input IDs shape: torch.Size([1, 10139]) Labels shape: torch.Size([1, 10139]) Final batch size: 1, sequence length: 32217 Attention mask shape: torch.Size([1, 1, 32217, 32217]) Position ids shape: torch.Size([1, 32217]) Input IDs shape: torch.Size([1, 32217]) Labels shape: torch.Size([1, 32217]) Final batch size: 1, sequence length: 40006 Attention mask shape: torch.Size([1, 1, 40006, 40006]) Position ids shape: torch.Size([1, 40006]) Input IDs shape: torch.Size([1, 40006]) Labels shape: torch.Size([1, 40006]) Final batch size: 1, sequence length: 26696 Attention mask shape: torch.Size([1, 1, 26696, 26696]) Position ids shape: torch.Size([1, 26696]) Input IDs shape: torch.Size([1, 26696]) Labels shape: torch.Size([1, 26696]) Final batch size: 1, sequence length: 26685 Attention mask shape: torch.Size([1, 1, 26685, 26685]) Position ids shape: torch.Size([1, 26685]) Input IDs shape: torch.Size([1, 26685]) Labels shape: torch.Size([1, 26685]) Final batch size: 1, sequence length: 31115 Attention mask shape: torch.Size([1, 1, 31115, 31115]) Position ids shape: torch.Size([1, 31115]) Input IDs shape: torch.Size([1, 31115]) Labels shape: torch.Size([1, 31115]) Final batch size: 1, sequence length: 29085 Attention mask shape: torch.Size([1, 1, 29085, 29085]) Position ids shape: torch.Size([1, 29085]) Input IDs shape: torch.Size([1, 29085]) Labels shape: torch.Size([1, 29085]) Final batch size: 1, sequence length: 18734 Attention mask shape: torch.Size([1, 1, 18734, 18734]) Position ids shape: torch.Size([1, 18734]) Input IDs shape: torch.Size([1, 18734]) Labels shape: torch.Size([1, 18734]) Final batch size: 1, sequence length: 29286 Attention mask shape: torch.Size([1, 1, 29286, 29286]) Position ids shape: torch.Size([1, 29286]) Input IDs shape: torch.Size([1, 29286]) Labels shape: torch.Size([1, 29286]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 23848 Attention mask shape: torch.Size([1, 1, 23848, 23848]) Position ids shape: torch.Size([1, 23848]) Input IDs shape: torch.Size([1, 23848]) Labels shape: torch.Size([1, 23848]) Final batch size: 1, sequence length: 19138 Attention mask shape: torch.Size([1, 1, 19138, 19138]) Position ids shape: torch.Size([1, 19138]) Input IDs shape: torch.Size([1, 19138]) Labels shape: torch.Size([1, 19138]) Final batch size: 1, sequence length: 35949 Attention mask shape: torch.Size([1, 1, 35949, 35949]) Position ids shape: torch.Size([1, 35949]) Input IDs shape: torch.Size([1, 35949]) Labels shape: torch.Size([1, 35949]) Final batch size: 1, sequence length: 31991 Attention mask shape: torch.Size([1, 1, 31991, 31991]) Position ids shape: torch.Size([1, 31991]) Input IDs shape: torch.Size([1, 31991]) Labels shape: torch.Size([1, 31991]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 39018 Attention mask shape: torch.Size([1, 1, 39018, 39018]) Position ids shape: torch.Size([1, 39018]) Input IDs shape: torch.Size([1, 39018]) Labels shape: torch.Size([1, 39018]) Final batch size: 1, sequence length: 22500 Attention mask shape: torch.Size([1, 1, 22500, 22500]) Position ids shape: torch.Size([1, 22500]) Input IDs shape: torch.Size([1, 22500]) Labels shape: torch.Size([1, 22500]) Final batch size: 1, sequence length: 25990 Attention mask shape: torch.Size([1, 1, 25990, 25990]) Position ids shape: torch.Size([1, 25990]) Input IDs shape: torch.Size([1, 25990]) Labels shape: torch.Size([1, 25990]) Final batch size: 1, sequence length: 32762 Attention mask shape: torch.Size([1, 1, 32762, 32762]) Position ids shape: torch.Size([1, 32762]) Input IDs shape: torch.Size([1, 32762]) Labels shape: torch.Size([1, 32762]) Final batch size: 1, sequence length: 37680 Attention mask shape: torch.Size([1, 1, 37680, 37680]) Position ids shape: torch.Size([1, 37680]) Input IDs shape: torch.Size([1, 37680]) Labels shape: torch.Size([1, 37680]) Final batch size: 1, sequence length: 34628 Attention mask shape: torch.Size([1, 1, 34628, 34628]) Position ids shape: torch.Size([1, 34628]) Input IDs shape: torch.Size([1, 34628]) Labels shape: torch.Size([1, 34628]) Final batch size: 1, sequence length: 40834 Attention mask shape: torch.Size([1, 1, 40834, 40834]) Position ids shape: torch.Size([1, 40834]) Input IDs shape: torch.Size([1, 40834]) Labels shape: torch.Size([1, 40834]) Final batch size: 1, sequence length: 22778 Attention mask shape: torch.Size([1, 1, 22778, 22778]) Position ids shape: torch.Size([1, 22778]) Input IDs shape: torch.Size([1, 22778]) Labels shape: torch.Size([1, 22778]) Final batch size: 1, sequence length: 32675 Attention mask shape: torch.Size([1, 1, 32675, 32675]) Position ids shape: torch.Size([1, 32675]) Input IDs shape: torch.Size([1, 32675]) Labels shape: torch.Size([1, 32675]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 26372 Attention mask shape: torch.Size([1, 1, 26372, 26372]) Position ids shape: torch.Size([1, 26372]) Input IDs shape: torch.Size([1, 26372]) Labels shape: torch.Size([1, 26372]) Final batch size: 1, sequence length: 34791 Attention mask shape: torch.Size([1, 1, 34791, 34791]) Position ids shape: torch.Size([1, 34791]) Input IDs shape: torch.Size([1, 34791]) Labels shape: torch.Size([1, 34791]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) {'loss': 0.245, 'grad_norm': 0.13815342651553436, 'learning_rate': 2.7390523158633552e-08, 'num_tokens': -inf, 'epoch': 7.88} Final batch size: 1, sequence length: 5686 Attention mask shape: torch.Size([1, 1, 5686, 5686]) Position ids shape: torch.Size([1, 5686]) Input IDs shape: torch.Size([1, 5686]) Labels shape: torch.Size([1, 5686]) Final batch size: 1, sequence length: 6986 Attention mask shape: torch.Size([1, 1, 6986, 6986]) Position ids shape: torch.Size([1, 6986]) Input IDs shape: torch.Size([1, 6986]) Labels shape: torch.Size([1, 6986]) Final batch size: 1, sequence length: 12847 Attention mask shape: torch.Size([1, 1, 12847, 12847]) Position ids shape: torch.Size([1, 12847]) Input IDs shape: torch.Size([1, 12847]) Labels shape: torch.Size([1, 12847]) Final batch size: 1, sequence length: 11304 Attention mask shape: torch.Size([1, 1, 11304, 11304]) Position ids shape: torch.Size([1, 11304]) Input IDs shape: torch.Size([1, 11304]) Labels shape: torch.Size([1, 11304]) Final batch size: 1, sequence length: 14168 Attention mask shape: torch.Size([1, 1, 14168, 14168]) Position ids shape: torch.Size([1, 14168]) Input IDs shape: torch.Size([1, 14168]) Labels shape: torch.Size([1, 14168]) Final batch size: 1, sequence length: 14304 Attention mask shape: torch.Size([1, 1, 14304, 14304]) Position ids shape: torch.Size([1, 14304]) Input IDs shape: torch.Size([1, 14304]) Labels shape: torch.Size([1, 14304]) Final batch size: 1, sequence length: 15621 Attention mask shape: torch.Size([1, 1, 15621, 15621]) Position ids shape: torch.Size([1, 15621]) Input IDs shape: torch.Size([1, 15621]) Labels shape: torch.Size([1, 15621]) Final batch size: 1, sequence length: 13872 Attention mask shape: torch.Size([1, 1, 13872, 13872]) Position ids shape: torch.Size([1, 13872]) Input IDs shape: torch.Size([1, 13872]) Labels shape: torch.Size([1, 13872]) Final batch size: 1, sequence length: 13224 Attention mask shape: torch.Size([1, 1, 13224, 13224]) Position ids shape: torch.Size([1, 13224]) Input IDs shape: torch.Size([1, 13224]) Labels shape: torch.Size([1, 13224]) Final batch size: 1, sequence length: 16514 Attention mask shape: torch.Size([1, 1, 16514, 16514]) Position ids shape: torch.Size([1, 16514]) Input IDs shape: torch.Size([1, 16514]) Labels shape: torch.Size([1, 16514]) Final batch size: 1, sequence length: 15652 Attention mask shape: torch.Size([1, 1, 15652, 15652]) Position ids shape: torch.Size([1, 15652]) Input IDs shape: torch.Size([1, 15652]) Labels shape: torch.Size([1, 15652]) Final batch size: 1, sequence length: 18246 Attention mask shape: torch.Size([1, 1, 18246, 18246]) Position ids shape: torch.Size([1, 18246]) Input IDs shape: torch.Size([1, 18246]) Labels shape: torch.Size([1, 18246]) Final batch size: 1, sequence length: 17213 Attention mask shape: torch.Size([1, 1, 17213, 17213]) Position ids shape: torch.Size([1, 17213]) Input IDs shape: torch.Size([1, 17213]) Labels shape: torch.Size([1, 17213]) Final batch size: 1, sequence length: 17773 Attention mask shape: torch.Size([1, 1, 17773, 17773]) Position ids shape: torch.Size([1, 17773]) Input IDs shape: torch.Size([1, 17773]) Labels shape: torch.Size([1, 17773]) Final batch size: 1, sequence length: 18044 Attention mask shape: torch.Size([1, 1, 18044, 18044]) Position ids shape: torch.Size([1, 18044]) Input IDs shape: torch.Size([1, 18044]) Labels shape: torch.Size([1, 18044]) Final batch size: 1, sequence length: 16501 Attention mask shape: torch.Size([1, 1, 16501, 16501]) Position ids shape: torch.Size([1, 16501]) Input IDs shape: torch.Size([1, 16501]) Labels shape: torch.Size([1, 16501]) Final batch size: 1, sequence length: 16440 Attention mask shape: torch.Size([1, 1, 16440, 16440]) Position ids shape: torch.Size([1, 16440]) Input IDs shape: torch.Size([1, 16440]) Labels shape: torch.Size([1, 16440]) Final batch size: 1, sequence length: 16378 Attention mask shape: torch.Size([1, 1, 16378, 16378]) Position ids shape: torch.Size([1, 16378]) Input IDs shape: torch.Size([1, 16378]) Labels shape: torch.Size([1, 16378]) Final batch size: 1, sequence length: 21163 Attention mask shape: torch.Size([1, 1, 21163, 21163]) Position ids shape: torch.Size([1, 21163]) Input IDs shape: torch.Size([1, 21163]) Labels shape: torch.Size([1, 21163]) Final batch size: 1, sequence length: 18155 Attention mask shape: torch.Size([1, 1, 18155, 18155]) Position ids shape: torch.Size([1, 18155]) Input IDs shape: torch.Size([1, 18155]) Labels shape: torch.Size([1, 18155]) Final batch size: 1, sequence length: 20176 Attention mask shape: torch.Size([1, 1, 20176, 20176]) Position ids shape: torch.Size([1, 20176]) Input IDs shape: torch.Size([1, 20176]) Labels shape: torch.Size([1, 20176]) Final batch size: 1, sequence length: 17852 Attention mask shape: torch.Size([1, 1, 17852, 17852]) Position ids shape: torch.Size([1, 17852]) Input IDs shape: torch.Size([1, 17852]) Labels shape: torch.Size([1, 17852]) Final batch size: 1, sequence length: 18289 Attention mask shape: torch.Size([1, 1, 18289, 18289]) Position ids shape: torch.Size([1, 18289]) Input IDs shape: torch.Size([1, 18289]) Labels shape: torch.Size([1, 18289]) Final batch size: 1, sequence length: 20813 Attention mask shape: torch.Size([1, 1, 20813, 20813]) Position ids shape: torch.Size([1, 20813]) Input IDs shape: torch.Size([1, 20813]) Labels shape: torch.Size([1, 20813]) Final batch size: 1, sequence length: 20766 Attention mask shape: torch.Size([1, 1, 20766, 20766]) Position ids shape: torch.Size([1, 20766]) Input IDs shape: torch.Size([1, 20766]) Labels shape: torch.Size([1, 20766]) Final batch size: 1, sequence length: 21207 Attention mask shape: torch.Size([1, 1, 21207, 21207]) Position ids shape: torch.Size([1, 21207]) Input IDs shape: torch.Size([1, 21207]) Labels shape: torch.Size([1, 21207]) Final batch size: 1, sequence length: 21316 Attention mask shape: torch.Size([1, 1, 21316, 21316]) Position ids shape: torch.Size([1, 21316]) Input IDs shape: torch.Size([1, 21316]) Labels shape: torch.Size([1, 21316]) Final batch size: 1, sequence length: 22152 Attention mask shape: torch.Size([1, 1, 22152, 22152]) Position ids shape: torch.Size([1, 22152]) Input IDs shape: torch.Size([1, 22152]) Labels shape: torch.Size([1, 22152]) Final batch size: 1, sequence length: 22117 Attention mask shape: torch.Size([1, 1, 22117, 22117]) Position ids shape: torch.Size([1, 22117]) Input IDs shape: torch.Size([1, 22117]) Labels shape: torch.Size([1, 22117]) Final batch size: 1, sequence length: 21607 Attention mask shape: torch.Size([1, 1, 21607, 21607]) Position ids shape: torch.Size([1, 21607]) Input IDs shape: torch.Size([1, 21607]) Labels shape: torch.Size([1, 21607]) Final batch size: 1, sequence length: 22784 Attention mask shape: torch.Size([1, 1, 22784, 22784]) Position ids shape: torch.Size([1, 22784]) Input IDs shape: torch.Size([1, 22784]) Labels shape: torch.Size([1, 22784]) Final batch size: 1, sequence length: 22197 Attention mask shape: torch.Size([1, 1, 22197, 22197]) Position ids shape: torch.Size([1, 22197]) Input IDs shape: torch.Size([1, 22197]) Labels shape: torch.Size([1, 22197]) Final batch size: 1, sequence length: 21705 Attention mask shape: torch.Size([1, 1, 21705, 21705]) Position ids shape: torch.Size([1, 21705]) Input IDs shape: torch.Size([1, 21705]) Labels shape: torch.Size([1, 21705]) Final batch size: 1, sequence length: 25880 Attention mask shape: torch.Size([1, 1, 25880, 25880]) Position ids shape: torch.Size([1, 25880]) Input IDs shape: torch.Size([1, 25880]) Labels shape: torch.Size([1, 25880]) Final batch size: 1, sequence length: 26494 Attention mask shape: torch.Size([1, 1, 26494, 26494]) Position ids shape: torch.Size([1, 26494]) Input IDs shape: torch.Size([1, 26494]) Labels shape: torch.Size([1, 26494]) Final batch size: 1, sequence length: 25053 Attention mask shape: torch.Size([1, 1, 25053, 25053]) Position ids shape: torch.Size([1, 25053]) Input IDs shape: torch.Size([1, 25053]) Labels shape: torch.Size([1, 25053]) Final batch size: 1, sequence length: 26303 Final batch size: 1, sequence length: 21562 Attention mask shape: torch.Size([1, 1, 21562, 21562]) Attention mask shape: torch.Size([1, 1, 26303, 26303]) Position ids shape: torch.Size([1, 26303]) Input IDs shape: torch.Size([1, 26303]) Labels shape: torch.Size([1, 26303]) Position ids shape: torch.Size([1, 21562]) Input IDs shape: torch.Size([1, 21562]) Labels shape: torch.Size([1, 21562]) Final batch size: 1, sequence length: 26921 Attention mask shape: torch.Size([1, 1, 26921, 26921]) Position ids shape: torch.Size([1, 26921]) Input IDs shape: torch.Size([1, 26921]) Labels shape: torch.Size([1, 26921]) Final batch size: 1, sequence length: 27005 Attention mask shape: torch.Size([1, 1, 27005, 27005]) Position ids shape: torch.Size([1, 27005]) Input IDs shape: torch.Size([1, 27005]) Labels shape: torch.Size([1, 27005]) Final batch size: 1, sequence length: 30151 Attention mask shape: torch.Size([1, 1, 30151, 30151]) Position ids shape: torch.Size([1, 30151]) Input IDs shape: torch.Size([1, 30151]) Labels shape: torch.Size([1, 30151]) Final batch size: 1, sequence length: 28523 Attention mask shape: torch.Size([1, 1, 28523, 28523]) Position ids shape: torch.Size([1, 28523]) Input IDs shape: torch.Size([1, 28523]) Labels shape: torch.Size([1, 28523]) Final batch size: 1, sequence length: 30592 Attention mask shape: torch.Size([1, 1, 30592, 30592]) Position ids shape: torch.Size([1, 30592]) Input IDs shape: torch.Size([1, 30592]) Labels shape: torch.Size([1, 30592]) Final batch size: 1, sequence length: 28897 Attention mask shape: torch.Size([1, 1, 28897, 28897]) Position ids shape: torch.Size([1, 28897]) Input IDs shape: torch.Size([1, 28897]) Labels shape: torch.Size([1, 28897]) Final batch size: 1, sequence length: 32473 Attention mask shape: torch.Size([1, 1, 32473, 32473]) Position ids shape: torch.Size([1, 32473]) Input IDs shape: torch.Size([1, 32473]) Labels shape: torch.Size([1, 32473]) Final batch size: 1, sequence length: 33482 Attention mask shape: torch.Size([1, 1, 33482, 33482]) Position ids shape: torch.Size([1, 33482]) Input IDs shape: torch.Size([1, 33482]) Labels shape: torch.Size([1, 33482]) Final batch size: 1, sequence length: 32596 Attention mask shape: torch.Size([1, 1, 32596, 32596]) Position ids shape: torch.Size([1, 32596]) Input IDs shape: torch.Size([1, 32596]) Labels shape: torch.Size([1, 32596]) Final batch size: 1, sequence length: 34469 Attention mask shape: torch.Size([1, 1, 34469, 34469]) Position ids shape: torch.Size([1, 34469]) Input IDs shape: torch.Size([1, 34469]) Labels shape: torch.Size([1, 34469]) Final batch size: 1, sequence length: 35030 Attention mask shape: torch.Size([1, 1, 35030, 35030]) Position ids shape: torch.Size([1, 35030]) Input IDs shape: torch.Size([1, 35030]) Labels shape: torch.Size([1, 35030]) Final batch size: 1, sequence length: 35100 Attention mask shape: torch.Size([1, 1, 35100, 35100]) Position ids shape: torch.Size([1, 35100]) Input IDs shape: torch.Size([1, 35100]) Labels shape: torch.Size([1, 35100]) Final batch size: 1, sequence length: 35485 Attention mask shape: torch.Size([1, 1, 35485, 35485]) Position ids shape: torch.Size([1, 35485]) Input IDs shape: torch.Size([1, 35485]) Labels shape: torch.Size([1, 35485]) Final batch size: 1, sequence length: 35523 Attention mask shape: torch.Size([1, 1, 35523, 35523]) Position ids shape: torch.Size([1, 35523]) Input IDs shape: torch.Size([1, 35523]) Labels shape: torch.Size([1, 35523]) Final batch size: 1, sequence length: 35866 Attention mask shape: torch.Size([1, 1, 35866, 35866]) Position ids shape: torch.Size([1, 35866]) Input IDs shape: torch.Size([1, 35866]) Labels shape: torch.Size([1, 35866]) Final batch size: 1, sequence length: 37980 Attention mask shape: torch.Size([1, 1, 37980, 37980]) Position ids shape: torch.Size([1, 37980]) Input IDs shape: torch.Size([1, 37980]) Labels shape: torch.Size([1, 37980]) Final batch size: 1, sequence length: 39589 Attention mask shape: torch.Size([1, 1, 39589, 39589]) Position ids shape: torch.Size([1, 39589]) Input IDs shape: torch.Size([1, 39589]) Labels shape: torch.Size([1, 39589]) Final batch size: 1, sequence length: 40526 Attention mask shape: torch.Size([1, 1, 40526, 40526]) Position ids shape: torch.Size([1, 40526]) Input IDs shape: torch.Size([1, 40526]) Labels shape: torch.Size([1, 40526]) Final batch size: 1, sequence length: 37040 Attention mask shape: torch.Size([1, 1, 37040, 37040]) Position ids shape: torch.Size([1, 37040]) Input IDs shape: torch.Size([1, 37040]) Labels shape: torch.Size([1, 37040]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 38927 Attention mask shape: torch.Size([1, 1, 38927, 38927]) Position ids shape: torch.Size([1, 38927]) Input IDs shape: torch.Size([1, 38927]) Labels shape: torch.Size([1, 38927]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) Final batch size: 1, sequence length: 40960 Attention mask shape: torch.Size([1, 1, 40960, 40960]) Position ids shape: torch.Size([1, 40960]) Input IDs shape: torch.Size([1, 40960]) Labels shape: torch.Size([1, 40960]) {'loss': 0.2378, 'grad_norm': 0.14807342253741837, 'learning_rate': 6.852326227130835e-09, 'num_tokens': -inf, 'epoch': 8.0} {'train_runtime': 9811.2832, 'train_samples_per_second': 0.786, 'train_steps_per_second': 0.007, 'train_loss': 0.28897070651873946, 'epoch': 8.0} wandb: wandb: 🚀 View run runs/dev/tw-data-train_final_replaced_from_classified-fix-format-8node-resume at: https://wandb.ai/ligeng-zhu/ThreadWeaver/runs/tw-data-train_final_replaced_from_classified-fix-format-8node-resume