SLURM_JOB_ID = 9168628 SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume RUN_NAME = tw-8node-resume OUTPUT_DIR = runs/dev/tw-8node-resume NNODES = 8 NODES = batch-block1-3061 batch-block1-3736 batch-block1-3305 batch-block1-3544 batch-block1-1089 batch-block1-3163 batch-block1-3038 batch-block1-3035 NODE_RANK = 0 GPUS_PER_NODE = 8 MASTER_ADDR = batch-block1-3061 MASTER_PORT = 25001 GLOBAL_TRAIN_BATCH_SIZE = GRADIENT_ACCUMULATION_STEPS = PER_DEVICE_TRAIN_BATCH_SIZE = Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64 SLURM_JOB_ID = 9168628 SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume RUN_NAME = tw-8node-resume OUTPUT_DIR = runs/dev/tw-8node-resume NNODES = 8 SLURM_JOB_ID = 9168628 SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume RUN_NAME = tw-8node-resume OUTPUT_DIR = runs/dev/tw-8node-resume NNODES = 8 SLURM_JOB_ID = 9168628 SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume RUN_NAME = tw-8node-resume OUTPUT_DIR = runs/dev/tw-8node-resume NNODES = 8 SLURM_JOB_ID = 9168628 SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume RUN_NAME = tw-8node-resume OUTPUT_DIR = runs/dev/tw-8node-resume NNODES = 8 SLURM_JOB_ID = 9168628 SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume RUN_NAME = tw-8node-resume OUTPUT_DIR = runs/dev/tw-8node-resume NNODES = 8 NODES = batch-block1-3061 batch-block1-3736 batch-block1-3305 batch-block1-3544 batch-block1-1089 batch-block1-3163 batch-block1-3038 batch-block1-3035 NODE_RANK = 7 GPUS_PER_NODE = 8 NODES = batch-block1-3061 batch-block1-3736 batch-block1-3305 batch-block1-3544 batch-block1-1089 batch-block1-3163 batch-block1-3038 batch-block1-3035 SLURM_JOB_ID = 9168628 SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume NODE_RANK = 2 GPUS_PER_NODE = 8 RUN_NAME = tw-8node-resume OUTPUT_DIR = runs/dev/tw-8node-resume NNODES = 8 NODES = batch-block1-3061 batch-block1-3736 batch-block1-3305 batch-block1-3544 batch-block1-1089 batch-block1-3163 batch-block1-3038 batch-block1-3035 NODE_RANK = 6 GPUS_PER_NODE = 8 NODES = batch-block1-3061 batch-block1-3736 batch-block1-3305 batch-block1-3544 batch-block1-1089 batch-block1-3163 batch-block1-3038 batch-block1-3035 NODE_RANK = 3 GPUS_PER_NODE = 8 NODES = batch-block1-3061 batch-block1-3736 batch-block1-3305 batch-block1-3544 batch-block1-1089 batch-block1-3163 batch-block1-3038 batch-block1-3035 NODE_RANK = 4 GPUS_PER_NODE = 8 MASTER_ADDR = batch-block1-3061 MASTER_PORT = 25001 GLOBAL_TRAIN_BATCH_SIZE = GRADIENT_ACCUMULATION_STEPS = PER_DEVICE_TRAIN_BATCH_SIZE = MASTER_ADDR = batch-block1-3061 MASTER_PORT = 25001 GLOBAL_TRAIN_BATCH_SIZE = GRADIENT_ACCUMULATION_STEPS = PER_DEVICE_TRAIN_BATCH_SIZE = MASTER_ADDR = batch-block1-3061 MASTER_PORT = 25001 GLOBAL_TRAIN_BATCH_SIZE = GRADIENT_ACCUMULATION_STEPS = PER_DEVICE_TRAIN_BATCH_SIZE = NODES = batch-block1-3061 batch-block1-3736 batch-block1-3305 batch-block1-3544 batch-block1-1089 batch-block1-3163 batch-block1-3038 batch-block1-3035 NODE_RANK = 5 GPUS_PER_NODE = 8 MASTER_ADDR = batch-block1-3061 MASTER_PORT = 25001 GLOBAL_TRAIN_BATCH_SIZE = GRADIENT_ACCUMULATION_STEPS = PER_DEVICE_TRAIN_BATCH_SIZE = Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64 Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64 MASTER_ADDR = batch-block1-3061 MASTER_PORT = 25001 GLOBAL_TRAIN_BATCH_SIZE = GRADIENT_ACCUMULATION_STEPS = PER_DEVICE_TRAIN_BATCH_SIZE = Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64 Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64 Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64 MASTER_ADDR = batch-block1-3061 MASTER_PORT = 25001 GLOBAL_TRAIN_BATCH_SIZE = GRADIENT_ACCUMULATION_STEPS = PER_DEVICE_TRAIN_BATCH_SIZE = Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64 SLURM_JOB_ID = 9168628 SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume RUN_NAME = tw-8node-resume OUTPUT_DIR = runs/dev/tw-8node-resume NNODES = 8 NODES = batch-block1-3061 batch-block1-3736 batch-block1-3305 batch-block1-3544 batch-block1-1089 batch-block1-3163 batch-block1-3038 batch-block1-3035 NODE_RANK = 1 GPUS_PER_NODE = 8 MASTER_ADDR = batch-block1-3061 MASTER_PORT = 25001 GLOBAL_TRAIN_BATCH_SIZE = GRADIENT_ACCUMULATION_STEPS = PER_DEVICE_TRAIN_BATCH_SIZE = Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1Imported prefix tree collator v1 Imported prefix tree collator v1Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 Imported prefix tree collator v1 [2026-04-13 22:59:31,354] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:31,508] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:31,522] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:31,534] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:31,537] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:31,571] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:31,609] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:31,624] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:31,732] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:31,735] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:31,750] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:31,765] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:31,766] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:31,812] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:31,824] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:31,830] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:31,843] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:31,848] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:31,851] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:31,858] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:31,863] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:31,870] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:31,870] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:31,880] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:31,882] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:31,886] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:31,888] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:31,890] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:31,892] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:31,894] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:31,894] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:31,896] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:31,900] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:31,904] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:31,954] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:31,956] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:31,962] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:31,968] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:31,970] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:31,978] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:32,119] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:32,126] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:32,136] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:32,172] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:32,178] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:32,183] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:32,197] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:32,205] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:32,207] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:32,210] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:32,211] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:32,295] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:32,309] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:32,309] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:32,311] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:32,314] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:39,924] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 22:59:40,048] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 22:59:40,160] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 22:59:40,170] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 22:59:40,319] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 22:59:40,344] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 22:59:40,387] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 22:59:40,400] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 22:59:40,426] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 22:59:40,432] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 22:59:40,447] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 22:59:40,478] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 22:59:40,482] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 22:59:40,490] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 22:59:40,508] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 22:59:40,545] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 22:59:40,551] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 22:59:40,569] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 22:59:40,571] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 22:59:40,577] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 22:59:40,638] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 22:59:40,638] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 22:59:40,657] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 22:59:40,666] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 22:59:40,671] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 22:59:40,677] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 22:59:40,681] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 22:59:40,682] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 22:59:40,715] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 22:59:40,751] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 22:59:40,753] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 22:59:40,756] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 22:59:40,778] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 22:59:40,812] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 22:59:40,826] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 22:59:40,888] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 22:59:40,912] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 22:59:40,934] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 22:59:40,938] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 22:59:40,941] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 22:59:40,952] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 22:59:40,984] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 22:59:40,989] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 22:59:41,068] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 22:59:41,107] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 22:59:41,124] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 22:59:41,126] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 22:59:41,129] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 22:59:41,133] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 22:59:41,138] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 22:59:41,345] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 22:59:41,463] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 22:59:41,478] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 22:59:41,507] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 22:59:41,508] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 22:59:41,517] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 22:59:42,784] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 22:59:42,797] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 22:59:42,811] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 22:59:42,813] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 22:59:42,824] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 22:59:42,827] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 22:59:42,828] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 22:59:42,842] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 22:59:42,883] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 22:59:42,935] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 22:59:42,947] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 22:59:42,949] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 22:59:42,954] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 22:59:42,962] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 22:59:42,962] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 22:59:42,968] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 22:59:42,971] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 22:59:47,499] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 22:59:47,501] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 22:59:47,501] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 22:59:47,540] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 22:59:47,553] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 22:59:47,561] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 22:59:47,562] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 22:59:47,587] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 22:59:47,590] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 22:59:47,594] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 22:59:47,595] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 22:59:47,597] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 22:59:47,599] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 22:59:47,610] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 22:59:47,612] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 22:59:47,613] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 22:59:47,696] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 22:59:52,018] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:52,018] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:52,018] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:52,018] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:52,018] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:52,018] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:52,018] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:52,018] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2026-04-13 22:59:52,424] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 22:59:52,454] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 22:59:52,470] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 22:59:52,483] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 22:59:52,483] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 22:59:52,486] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 22:59:52,504] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 22:59:52,505] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 22:59:52,509] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 22:59:52,510] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 22:59:52,513] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 22:59:52,538] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 22:59:52,540] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 22:59:52,541] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 22:59:52,541] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 22:59:52,542] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 22:59:52,545] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 22:59:52,550] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 22:59:52,551] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 22:59:52,570] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 22:59:52,572] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 22:59:52,575] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:00:03,732] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:00:03,732] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:00:03,732] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:00:03,732] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:00:03,732] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:00:03,732] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:00:03,732] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:00:03,732] [INFO] [comm.py:706:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl [2026-04-13 23:00:03,732] [INFO] [comm.py:675:init_distributed] cdb=None [2026-04-13 23:00:05,419] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:00:05,419] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:00:05,419] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:00:05,419] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:00:05,419] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:00:05,419] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:00:05,419] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:00:05,419] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64 [2026-04-13 23:00:10,135] [INFO] [partition_parameters.py:348:__exit__] finished initializing model - num_params = 399, num_elems = 8.19B Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64 Using Prefix Tree collator Using Prefix Tree collator args.report_to: ['wandb'] args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-41_batch-block1-3736 args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-1089 Using Prefix Tree collator Using Prefix Tree collator args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-1089 Using Prefix Tree collator args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3038 args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-39_batch-block1-3544 Using Prefix Tree collator args.report_to: ['wandb'] Using Prefix Tree collator args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3544 Using Prefix Tree collator args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3305 args.report_to: ['wandb'] Using Prefix Tree collator args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3038 Using Prefix Tree collator Using Prefix Tree collator Using Prefix Tree collator args.report_to: ['wandb'] Using Prefix Tree collator Using Prefix Tree collator Using Prefix Tree collator args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3035 Using Prefix Tree collator args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3163 args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3038 args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3038 args.report_to: ['wandb'] args.report_to: ['wandb'] args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3163 args.report_to: ['wandb'] Using Prefix Tree collator args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3038 Using Prefix Tree collator args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-41_batch-block1-3736 args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3035 Using Prefix Tree collator Using Prefix Tree collator args.report_to: ['wandb'] args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3038 Using Prefix Tree collator args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-1089 Using Prefix Tree collator args.report_to: ['wandb'] args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-41_batch-block1-3305 args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3544 args.report_to: ['wandb'] Using Prefix Tree collator args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3163 args.report_to: ['wandb'] Using Prefix Tree collator args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3305 args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-41_batch-block1-3305 Using Prefix Tree collator args.report_to: ['wandb'] Using Prefix Tree collator Using Prefix Tree collator args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-41_batch-block1-3736 args.report_to: ['wandb'] args.report_to: ['wandb'] args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-1089 args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3035 args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3038 Using Prefix Tree collator Using Prefix Tree collator Using Prefix Tree collator Using Prefix Tree collator Using Prefix Tree collator args.report_to: ['wandb'] Using Prefix Tree collator args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3038 args.report_to: ['wandb'] args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-41_batch-block1-3736 args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3544 Using Prefix Tree collator args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3163 args.report_to: ['wandb'] args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-1089 args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-41_batch-block1-3736 args.report_to: ['wandb'] Using Prefix Tree collatorUsing Prefix Tree collator args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-1089 Using Prefix Tree collator Using Prefix Tree collator args.report_to: ['wandb'] Using Prefix Tree collatorUsing Prefix Tree collator args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3163 Using Prefix Tree collator Using Prefix Tree collator args.report_to: ['wandb'] args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3163 args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3163 Using Prefix Tree collator args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3544 args.report_to: ['wandb'] args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3544 args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3544 args.report_to: ['wandb'] args.report_to: ['wandb'] args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-41_batch-block1-3736 args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3035 args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3163 Using Prefix Tree collator Using Prefix Tree collator args.report_to: ['wandb'] Using Prefix Tree collator args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-41_batch-block1-3736 args.report_to: ['wandb'] Using Prefix Tree collator args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3035 Using Prefix Tree collator args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-41_batch-block1-1089 args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-41_batch-block1-3736 args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3544 Using Prefix Tree collator args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3305 Using Prefix Tree collatorUsing Prefix Tree collator Using Prefix Tree collator args.report_to: ['wandb'] Using Prefix Tree collator Using Prefix Tree collator args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-41_batch-block1-3305 args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-41_batch-block1-3305 Using Prefix Tree collator args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3035 args.report_to: ['wandb'] args.report_to: ['wandb'] args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3035 args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-1089 args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3305 Using Prefix Tree collator args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3035 Using Prefix Tree collatorUsing Prefix Tree collatorUsing Prefix Tree collatorUsing Prefix Tree collator Using Prefix Tree collatorUsing Prefix Tree collator Using Prefix Tree collator Using Prefix Tree collator args.report_to: ['wandb']args.report_to: ['wandb']args.report_to: ['wandb'] args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-00-03_batch-block1-3061args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-00-03_batch-block1-3061args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-00-03_batch-block1-3061 args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-00-03_batch-block1-3061 args.report_to: ['wandb']args.report_to: ['wandb'] args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-00-03_batch-block1-3061args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-00-03_batch-block1-3061 args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-00-03_batch-block1-3061 args.report_to: ['wandb'] args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-00-03_batch-block1-3061 Parameter Offload: Total persistent parameters: 308224 in 145 params {'train_runtime': 3.8014, 'train_samples_per_second': 2028.721, 'train_steps_per_second': 16.836, 'train_loss': 0.0, 'epoch': 8.0} wandb: wandb: 🚀 View run runs/dev/tw-8node-resume at: https://wandb.ai/ligeng-zhu/ThreadWeaver/runs/tw-8node-resume