初始化项目,由ModelHub XC社区提供模型

Model: ligeng-dev/q3-8b-train_final_v2_nb2_mt8192_replaced_fix
Source: Original Platform
This commit is contained in:
ModelHub XC
2026-04-21 00:40:58 +08:00
commit 90ba3302c7
32 changed files with 203309 additions and 0 deletions

630
slurm/9168614.0.out Normal file
View File

@@ -0,0 +1,630 @@
SLURM_JOB_ID = 9168614
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
NODES = batch-block1-3273 batch-block1-3339 batch-block1-3736 batch-block1-3544 batch-block1-1089 batch-block1-3578 batch-block1-1062 batch-block1-3035
NODE_RANK = 3
GPUS_PER_NODE = 8
MASTER_ADDR = batch-block1-3273
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
SLURM_JOB_ID = 9168614
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
SLURM_JOB_ID = 9168614
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
SLURM_JOB_ID = 9168614
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
SLURM_JOB_ID = 9168614
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
SLURM_JOB_ID = 9168614
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
SLURM_JOB_ID = 9168614
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
SLURM_JOB_ID = 9168614
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
NODES = batch-block1-3273 batch-block1-3339 batch-block1-3736 batch-block1-3544 batch-block1-1089 batch-block1-3578 batch-block1-1062 batch-block1-3035
NODE_RANK = 7
GPUS_PER_NODE = 8
NODES = batch-block1-3273 batch-block1-3339 batch-block1-3736 batch-block1-3544 batch-block1-1089 batch-block1-3578 batch-block1-1062 batch-block1-3035
NODE_RANK = 6
GPUS_PER_NODE = 8
NODES = batch-block1-3273 batch-block1-3339 batch-block1-3736 batch-block1-3544 batch-block1-1089 batch-block1-3578 batch-block1-1062 batch-block1-3035
NODE_RANK = 0
GPUS_PER_NODE = 8
NODES = batch-block1-3273 batch-block1-3339 batch-block1-3736 batch-block1-3544 batch-block1-1089 batch-block1-3578 batch-block1-1062 batch-block1-3035
NODE_RANK = 1
GPUS_PER_NODE = 8
NODES = batch-block1-3273 batch-block1-3339 batch-block1-3736 batch-block1-3544 batch-block1-1089 batch-block1-3578 batch-block1-1062 batch-block1-3035
NODE_RANK = 4
GPUS_PER_NODE = 8
NODES = batch-block1-3273 batch-block1-3339 batch-block1-3736 batch-block1-3544 batch-block1-1089 batch-block1-3578 batch-block1-1062 batch-block1-3035
NODE_RANK = 5
GPUS_PER_NODE = 8
MASTER_ADDR = batch-block1-3273
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
MASTER_ADDR = batch-block1-3273
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
NODES = batch-block1-3273 batch-block1-3339 batch-block1-3736 batch-block1-3544 batch-block1-1089 batch-block1-3578 batch-block1-1062 batch-block1-3035
NODE_RANK = 2
GPUS_PER_NODE = 8
MASTER_ADDR = batch-block1-3273
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
MASTER_ADDR = batch-block1-3273
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
MASTER_ADDR = batch-block1-3273
MASTER_ADDR = batch-block1-3273
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
MASTER_ADDR = batch-block1-3273
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1Imported prefix tree collator v1
Imported prefix tree collator v1Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1Imported prefix tree collator v1
Imported prefix tree collator v1Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
[2026-04-13 22:52:44,641] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:44,641] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:44,641] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:44,641] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:44,641] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:44,641] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:44,641] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:44,641] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:44,823] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:44,823] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:44,823] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:44,823] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:44,823] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:44,823] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:44,823] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:44,824] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:44,830] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:44,830] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:44,830] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:44,830] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:44,830] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:44,830] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:44,830] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:44,831] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:45,044] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:45,044] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:45,044] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:45,044] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:45,044] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:45,044] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:45,044] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:45,044] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:45,505] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:45,505] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:45,505] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:45,505] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:45,505] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:45,505] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:45,505] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:45,505] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:45,710] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:45,710] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:45,710] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:45,710] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:45,710] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:45,710] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:45,710] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:45,711] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:46,098] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:46,098] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:46,098] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:46,098] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:46,098] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:46,098] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:46,098] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:46,099] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:46,114] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:46,114] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:46,114] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:46,114] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:46,114] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:46,114] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:46,114] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:46,115] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:57,932] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:57,932] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:57,932] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:57,932] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:57,932] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:57,932] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:57,932] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:57,932] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:58,617] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:58,617] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:58,617] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:58,617] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:58,617] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:58,617] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:58,618] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:58,618] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:58,976] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:58,976] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:58,976] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:58,976] [INFO] [comm.py:706:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
[2026-04-13 22:52:58,976] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:58,977] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:58,977] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:58,976] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:58,977] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:58,977] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:58,977] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:58,977] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:58,977] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:58,977] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:58,978] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:58,978] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:58,978] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:59,398] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:59,398] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:59,398] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:59,399] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:59,399] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:59,399] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:59,399] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:59,399] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:59,650] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:59,650] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:59,650] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:59,650] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:59,650] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:59,650] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:59,650] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:59,650] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:59,893] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:52:59,894] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:52:59,924] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:52:59,924] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:52:59,924] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:52:59,988] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:52:59,988] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:52:59,988] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:52:59,988] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:52:59,988] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:52:59,988] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:52:59,988] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:52:59,988] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:52:59,988] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:52:59,988] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:52:59,988] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:52:59,989] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:00,251] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:00,305] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:00,315] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:00,318] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:00,318] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:00,320] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:00,320] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:00,320] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:00,321] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:00,332] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:00,333] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:00,333] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:00,337] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:00,338] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:00,338] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:00,342] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:00,383] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:53:00,383] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:53:00,383] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:53:00,383] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:53:00,383] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:53:00,383] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:53:00,383] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:53:00,384] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:53:00,471] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:53:00,471] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:53:00,471] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:53:00,471] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:53:00,472] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:53:00,472] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:53:00,472] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:53:00,472] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:53:02,594] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:02,604] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:02,604] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:02,627] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:02,628] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:02,634] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:02,634] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:02,645] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:02,672] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:02,672] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:02,672] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:02,672] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:02,675] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:02,680] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:02,692] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:02,693] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:05,737] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:05,750] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:05,779] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:05,813] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:05,854] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:05,858] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:05,885] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:05,899] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:05,899] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:05,917] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:05,932] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:05,949] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:05,959] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:05,959] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:05,961] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:10,377] [INFO] [partition_parameters.py:348:__exit__] finished initializing model - num_params = 399, num_elems = 8.19B
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3578Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3578
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3578
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3578
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3578
args.report_to: ['wandb']
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3578
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3578
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3578
Using Prefix Tree collatorUsing Prefix Tree collatorUsing Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']args.report_to: ['wandb']
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-1062args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-1062
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-1062
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-1062
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-1062
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-1062
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-1062
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-1062
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collatorUsing Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collatorUsing Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-3736
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-3736
args.report_to: ['wandb']args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-3736args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-3736
args.report_to: ['wandb']args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-3736args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-3736
args.report_to: ['wandb']
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-3736
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-3736
Using Prefix Tree collator
Using Prefix Tree collatorUsing Prefix Tree collator
Using Prefix Tree collatorUsing Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-1089
args.report_to: ['wandb']
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-1089
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-1089
args.report_to: ['wandb']args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-1089args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-1089
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-1089
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-1089
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-1089
Using Prefix Tree collator
Using Prefix Tree collatorUsing Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3273
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3273
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3273
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3273
Using Prefix Tree collatorargs.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3273
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3273
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3273
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3273
Using Prefix Tree collatorUsing Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']args.report_to: ['wandb']args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3339args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3339
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3339
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3339
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3339
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3339
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3339args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3339
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collatorUsing Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3035
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3035
args.report_to: ['wandb']
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3035
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3035
Using Prefix Tree collator
args.report_to: ['wandb']Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3035
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3035
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3035
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3035
Using Prefix Tree collator
Using Prefix Tree collatorUsing Prefix Tree collatorUsing Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-57_batch-block1-3544
args.report_to: ['wandb']args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-57_batch-block1-3544args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-57_batch-block1-3544
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-57_batch-block1-3544
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-57_batch-block1-3544
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-57_batch-block1-3544
args.report_to: ['wandb']args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-57_batch-block1-3544args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-57_batch-block1-3544
Parameter Offload: Total persistent parameters: 308224 in 145 params
{'train_runtime': 4.1111, 'train_samples_per_second': 1875.899, 'train_steps_per_second': 15.568, 'train_loss': 0.0, 'epoch': 8.0}
wandb:
wandb: 🚀 View run runs/dev/tw-8node-resume at: https://wandb.ai/ligeng-zhu/ThreadWeaver/runs/tw-8node-resume