SLURM_JOB_ID = 9168643 SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume RUN_NAME = tw-8node-resume OUTPUT_DIR = runs/dev/tw-8node-resume NNODES = 8 NODES = batch-block1-1015 batch-block1-3297 batch-block1-3887 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3227 NODE_RANK = 2 GPUS_PER_NODE = 8 MASTER_ADDR = batch-block1-1015 MASTER_PORT = 25001 GLOBAL_TRAIN_BATCH_SIZE = GRADIENT_ACCUMULATION_STEPS = PER_DEVICE_TRAIN_BATCH_SIZE = Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64 SLURM_JOB_ID = 9168643 SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume RUN_NAME = tw-8node-resume OUTPUT_DIR = runs/dev/tw-8node-resume NNODES = 8 SLURM_JOB_ID = 9168643 SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume RUN_NAME = tw-8node-resume OUTPUT_DIR = runs/dev/tw-8node-resume NNODES = 8 SLURM_JOB_ID = 9168643 SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume RUN_NAME = tw-8node-resume OUTPUT_DIR = runs/dev/tw-8node-resume NNODES = 8 SLURM_JOB_ID = 9168643 SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume RUN_NAME = tw-8node-resume OUTPUT_DIR = runs/dev/tw-8node-resume NNODES = 8 SLURM_JOB_ID = 9168643 SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume RUN_NAME = tw-8node-resume OUTPUT_DIR = runs/dev/tw-8node-resume NNODES = 8 SLURM_JOB_ID = 9168643 SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume RUN_NAME = tw-8node-resume OUTPUT_DIR = runs/dev/tw-8node-resume NNODES = 8 NODES = batch-block1-1015 batch-block1-3297 batch-block1-3887 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3227 NODE_RANK = 6 GPUS_PER_NODE = 8 SLURM_JOB_ID = 9168643 SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume RUN_NAME = tw-8node-resume OUTPUT_DIR = runs/dev/tw-8node-resume NNODES = 8 NODES = batch-block1-1015 batch-block1-3297 batch-block1-3887 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3227 NODE_RANK = 4 GPUS_PER_NODE = 8 NODES = batch-block1-1015 batch-block1-3297 batch-block1-3887 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3227 NODE_RANK = 5 GPUS_PER_NODE = 8 NODES = batch-block1-1015 batch-block1-3297 batch-block1-3887 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3227 NODE_RANK = 1 GPUS_PER_NODE = 8 NODES = batch-block1-1015 batch-block1-3297 batch-block1-3887 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3227 NODE_RANK = 7 GPUS_PER_NODE = 8 NODES = batch-block1-1015 batch-block1-3297 batch-block1-3887 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3227 NODE_RANK = 3 GPUS_PER_NODE = 8 MASTER_ADDR = batch-block1-1015 MASTER_PORT = 25001 GLOBAL_TRAIN_BATCH_SIZE = GRADIENT_ACCUMULATION_STEPS = PER_DEVICE_TRAIN_BATCH_SIZE = NODES = batch-block1-1015 batch-block1-3297 batch-block1-3887 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3227 NODE_RANK = 0 GPUS_PER_NODE = 8 MASTER_ADDR = batch-block1-1015 MASTER_PORT = 25001 GLOBAL_TRAIN_BATCH_SIZE = GRADIENT_ACCUMULATION_STEPS = PER_DEVICE_TRAIN_BATCH_SIZE = Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64 MASTER_ADDR = batch-block1-1015 MASTER_PORT = 25001 GLOBAL_TRAIN_BATCH_SIZE = GRADIENT_ACCUMULATION_STEPS = PER_DEVICE_TRAIN_BATCH_SIZE = MASTER_ADDR = batch-block1-1015 MASTER_PORT = 25001 GLOBAL_TRAIN_BATCH_SIZE = GRADIENT_ACCUMULATION_STEPS = PER_DEVICE_TRAIN_BATCH_SIZE = MASTER_ADDR = batch-block1-1015 MASTER_PORT = 25001 GLOBAL_TRAIN_BATCH_SIZE = GRADIENT_ACCUMULATION_STEPS = PER_DEVICE_TRAIN_BATCH_SIZE = Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64 MASTER_ADDR = batch-block1-1015 MASTER_PORT = 25001 GLOBAL_TRAIN_BATCH_SIZE = GRADIENT_ACCUMULATION_STEPS = PER_DEVICE_TRAIN_BATCH_SIZE = MASTER_ADDR = batch-block1-1015 MASTER_PORT = 25001 GLOBAL_TRAIN_BATCH_SIZE = GRADIENT_ACCUMULATION_STEPS = PER_DEVICE_TRAIN_BATCH_SIZE = Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64 Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64 Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64 Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64 Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1Imported prefix tree collator v1 Imported prefix tree collator v1Imported prefix tree collator v1 Imported prefix tree collator v1Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 [2026-04-13 23:07:56,557] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:07:56,558] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:07:56,616] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:07:56,617] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:07:56,643] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:07:56,643] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:07:56,731] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:07:56,747] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:07:56,760] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:07:56,762] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:07:56,785] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:07:56,787] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:07:56,787] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:07:56,792] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:07:56,800] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:07:56,806] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:07:56,920] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:07:56,927] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:07:56,955] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:07:57,003] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:07:57,028] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:07:57,036] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:07:57,071] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:07:57,082] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:07:57,082] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:07:57,083] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:07:57,083] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:07:57,083] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:07:57,084] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:07:57,085] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:07:57,085] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:07:57,086] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:07:57,087] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:07:57,088] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:07:57,092] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:07:57,093] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:07:57,098] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:07:57,120] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:07:57,122] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:07:57,125] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:07:57,140] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:07:57,142] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:07:57,147] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:07:57,163] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:07:57,167] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:07:57,170] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:07:57,172] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:07:57,175] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:07:57,175] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:07:57,179] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:07:57,185] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:07:57,204] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:07:57,217] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:07:57,219] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:07:57,230] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:07:57,237] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:08:05,010] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:05,025] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:05,053] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:05,070] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:05,071] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:05,113] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:05,253] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:05,279] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:05,373] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:05,378] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:05,432] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:05,464] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:05,500] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:05,548] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:05,609] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:05,613] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:05,629] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:05,656] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:05,660] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:05,691] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:05,693] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:05,693] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:05,694] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:05,740] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:05,782] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:05,795] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:05,806] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:05,823] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:05,831] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:05,841] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:05,845] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:05,847] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:05,850] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:05,859] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:05,859] [INFO] [comm.py:706:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl [2026-04-13 23:08:05,881] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:05,893] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:05,948] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:05,965] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:05,970] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:05,989] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:06,001] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:06,007] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:06,011] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:06,013] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:06,039] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:06,078] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:06,118] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:06,128] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:06,129] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:06,131] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:06,132] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:06,145] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:06,193] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:06,197] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:06,240] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:06,257] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:06,371] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:06,391] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:06,431] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:06,444] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:06,492] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:06,500] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:06,506] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:06,510] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:06,514] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:06,518] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:06,518] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:06,518] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:06,520] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:06,523] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:06,528] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:06,529] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:06,540] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:06,546] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:06,550] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:06,551] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:06,555] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:06,579] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:06,609] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:06,630] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:06,645] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:06,697] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:06,714] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:06,714] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:06,732] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:06,735] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:06,740] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:06,740] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:06,755] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:13,160] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:13,184] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:13,184] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:13,185] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:13,224] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:13,239] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:13,240] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:13,243] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:13,243] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:13,245] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:13,254] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:13,264] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:13,272] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:13,291] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:13,452] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:13,457] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:17,477] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:17,531] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:17,547] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:17,548] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:17,558] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:17,645] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:17,648] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:18,097] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:08:18,097] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:08:18,097] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:08:18,097] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:08:18,097] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:08:18,097] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:08:18,097] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:08:18,098] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 23:08:30,097] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:30,098] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:30,098] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:30,099] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:30,109] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:30,110] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:30,110] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:30,110] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:08:32,677] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:32,677] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:32,677] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:32,677] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:32,677] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:32,677] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:32,677] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:32,677] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:08:36,881] [INFO] [partition_parameters.py:348:__exit__] finished initializing model - num_params = 399, num_elems = 8.19B Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Prefix Tree collator args.report_to: ['wandb'] Using Prefix Tree collator args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3273 args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1134 Using Prefix Tree collator Using Prefix Tree collator args.report_to: ['wandb'] args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-3297 args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3848 Using Prefix Tree collator Using Prefix Tree collator Using Prefix Tree collator Using Prefix Tree collator args.report_to: ['wandb'] args.report_to: ['wandb'] Using Prefix Tree collator args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3848 Using Prefix Tree collator args.report_to: ['wandb']Using Prefix Tree collator args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1134 args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-1134 args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1015 args.report_to: ['wandb'] Using Prefix Tree collator args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1027 args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3273 args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-1015 Using Prefix Tree collator args.report_to: ['wandb'] Using Prefix Tree collator args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-3297 args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-1015 Using Prefix Tree collator Using Prefix Tree collator args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3848 Using Prefix Tree collator Using Prefix Tree collator args.report_to: ['wandb'] args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3848 args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3273 Using Prefix Tree collator args.report_to: ['wandb'] args.report_to: ['wandb'] args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-3297 args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1134 args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3273 Using Prefix Tree collator Using Prefix Tree collator args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-1015 args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3273 Using Prefix Tree collator Using Prefix Tree collator Using Prefix Tree collator Using Prefix Tree collator Using Prefix Tree collator args.report_to: ['wandb'] args.report_to: ['wandb'] Using Prefix Tree collator args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3273 args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3273 args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-3297 args.report_to: ['wandb'] Using Prefix Tree collator args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-1134 args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3848 args.report_to: ['wandb'] Using Prefix Tree collator args.report_to: ['wandb'] Using Prefix Tree collator args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-1015 Using Prefix Tree collator args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3848 args.report_to: ['wandb'] args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1134 args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3227 args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3297 Using Prefix Tree collator Using Prefix Tree collator Using Prefix Tree collator Using Prefix Tree collator Using Prefix Tree collator args.report_to: ['wandb'] args.report_to: ['wandb'] args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1027 Using Prefix Tree collator Using Prefix Tree collator Using Prefix Tree collator args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3297 args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3848 args.report_to: ['wandb'] args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3227 args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-1015 args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1027 args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3273 args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3227 Using Prefix Tree collator args.report_to: ['wandb'] Using Prefix Tree collator args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3227 Using Prefix Tree collator Using Prefix Tree collator Using Prefix Tree collator args.report_to: ['wandb'] Using Prefix Tree collator Using Prefix Tree collator args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-3297 Using Prefix Tree collator args.report_to: ['wandb'] args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1027 args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1027 args.report_to: ['wandb'] args.report_to: ['wandb'] Using Prefix Tree collator args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1027 args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3227 args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3227 args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-3297 args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3848 Using Prefix Tree collator Using Prefix Tree collator Using Prefix Tree collator args.report_to: ['wandb'] args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3227 args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1027 args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1015 Using Prefix Tree collator Using Prefix Tree collator args.report_to: ['wandb'] args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1027 args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-1015 Using Prefix Tree collator args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1134 Using Prefix Tree collator args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-1134 Using Prefix Tree collator Using Prefix Tree collatorUsing Prefix Tree collator Using Prefix Tree collatorUsing Prefix Tree collator Using Prefix Tree collatorUsing Prefix Tree collatorUsing Prefix Tree collator args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-30_batch-block1-3887 args.report_to: ['wandb'] args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-30_batch-block1-3887 args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-30_batch-block1-3887 args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-30_batch-block1-3887 args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-30_batch-block1-3887 args.report_to: ['wandb'] args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-30_batch-block1-3887 args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-30_batch-block1-3887 args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-30_batch-block1-3887 Using Prefix Tree collator args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3227 Parameter Offload: Total persistent parameters: 308224 in 145 params {'train_runtime': 3.1214, 'train_samples_per_second': 2470.663, 'train_steps_per_second': 20.503, 'train_loss': 0.0, 'epoch': 8.0} wandb: wandb: 🚀 View run runs/dev/tw-8node-resume at: https://wandb.ai/ligeng-zhu/ThreadWeaver/runs/tw-8node-resume