初始化项目,由ModelHub XC社区提供模型
Model: ligeng-dev/q3-8b-train_final_v2_nb2_mt8192_replaced_fix Source: Original Platform
This commit is contained in:
630
slurm/9168628.0.out
Normal file
630
slurm/9168628.0.out
Normal file
@@ -0,0 +1,630 @@
|
||||
SLURM_JOB_ID = 9168628
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
NODES = batch-block1-3061 batch-block1-3736 batch-block1-3305 batch-block1-3544 batch-block1-1089 batch-block1-3163 batch-block1-3038 batch-block1-3035
|
||||
NODE_RANK = 0
|
||||
GPUS_PER_NODE = 8
|
||||
MASTER_ADDR = batch-block1-3061
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
SLURM_JOB_ID = 9168628
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
SLURM_JOB_ID = 9168628
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
SLURM_JOB_ID = 9168628
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
SLURM_JOB_ID = 9168628
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
SLURM_JOB_ID = 9168628
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
NODES = batch-block1-3061 batch-block1-3736 batch-block1-3305 batch-block1-3544 batch-block1-1089 batch-block1-3163 batch-block1-3038 batch-block1-3035
|
||||
NODE_RANK = 7
|
||||
GPUS_PER_NODE = 8
|
||||
NODES = batch-block1-3061 batch-block1-3736 batch-block1-3305 batch-block1-3544 batch-block1-1089 batch-block1-3163 batch-block1-3038 batch-block1-3035
|
||||
SLURM_JOB_ID = 9168628
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
NODE_RANK = 2
|
||||
GPUS_PER_NODE = 8
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
NODES = batch-block1-3061 batch-block1-3736 batch-block1-3305 batch-block1-3544 batch-block1-1089 batch-block1-3163 batch-block1-3038 batch-block1-3035
|
||||
NODE_RANK = 6
|
||||
GPUS_PER_NODE = 8
|
||||
NODES = batch-block1-3061 batch-block1-3736 batch-block1-3305 batch-block1-3544 batch-block1-1089 batch-block1-3163 batch-block1-3038 batch-block1-3035
|
||||
NODE_RANK = 3
|
||||
GPUS_PER_NODE = 8
|
||||
NODES = batch-block1-3061 batch-block1-3736 batch-block1-3305 batch-block1-3544 batch-block1-1089 batch-block1-3163 batch-block1-3038 batch-block1-3035
|
||||
NODE_RANK = 4
|
||||
GPUS_PER_NODE = 8
|
||||
MASTER_ADDR = batch-block1-3061
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
MASTER_ADDR = batch-block1-3061
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
MASTER_ADDR = batch-block1-3061
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
NODES = batch-block1-3061 batch-block1-3736 batch-block1-3305 batch-block1-3544 batch-block1-1089 batch-block1-3163 batch-block1-3038 batch-block1-3035
|
||||
NODE_RANK = 5
|
||||
GPUS_PER_NODE = 8
|
||||
MASTER_ADDR = batch-block1-3061
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
MASTER_ADDR = batch-block1-3061
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
MASTER_ADDR = batch-block1-3061
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
SLURM_JOB_ID = 9168628
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
NODES = batch-block1-3061 batch-block1-3736 batch-block1-3305 batch-block1-3544 batch-block1-1089 batch-block1-3163 batch-block1-3038 batch-block1-3035
|
||||
NODE_RANK = 1
|
||||
GPUS_PER_NODE = 8
|
||||
MASTER_ADDR = batch-block1-3061
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1Imported prefix tree collator v1
|
||||
|
||||
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
[2026-04-13 22:59:31,354] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,508] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,522] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,534] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,537] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,571] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,609] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,624] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,732] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,735] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,750] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,765] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,766] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,812] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,824] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,830] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,843] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,848] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,851] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,858] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,863] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,870] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,870] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,880] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,882] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,886] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,888] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,890] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,892] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,894] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,894] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,896] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,900] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,904] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,954] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,956] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,962] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,968] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,970] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,978] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:32,119] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:32,126] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:32,136] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:32,172] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:32,178] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:32,183] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:32,197] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:32,205] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:32,207] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:32,210] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:32,211] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:32,295] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:32,309] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:32,309] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:32,311] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:32,314] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:39,924] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,048] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,160] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,170] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,319] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,344] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,387] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,400] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,426] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,432] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,447] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,478] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,482] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,490] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,508] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,545] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,551] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,569] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,571] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,577] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,638] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,638] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,657] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,666] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,671] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,677] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,681] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,682] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,715] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,751] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,753] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,756] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,778] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,812] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,826] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,888] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,912] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,934] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,938] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,941] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,952] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,984] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,989] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:41,068] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:41,107] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:41,124] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:41,126] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:41,129] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:41,133] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:41,138] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:41,345] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:41,463] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:41,478] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:41,507] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:41,508] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:41,517] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:42,784] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:42,797] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:42,811] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:42,813] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:42,824] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:42,827] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:42,828] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:42,842] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:42,883] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:42,935] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:42,947] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:42,949] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:42,954] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:42,962] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:42,962] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:42,968] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:42,971] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:47,499] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:47,501] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:47,501] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:47,540] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:47,553] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:47,561] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:47,562] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:47,587] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:47,590] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:47,594] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:47,595] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:47,597] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:47,599] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:47,610] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:47,612] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:47,613] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:47,696] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:52,018] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:52,018] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:52,018] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:52,018] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:52,018] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:52,018] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:52,018] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:52,018] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:52,424] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:52,454] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:52,470] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:52,483] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:52,483] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:52,486] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:52,504] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:52,505] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:52,509] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:52,510] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:52,513] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:52,538] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:52,540] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:52,541] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:52,541] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:52,542] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:52,545] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:52,550] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:52,551] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:52,570] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:52,572] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:52,575] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:00:03,732] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:00:03,732] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:00:03,732] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:00:03,732] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:00:03,732] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:00:03,732] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:00:03,732] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:00:03,732] [INFO] [comm.py:706:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
|
||||
[2026-04-13 23:00:03,732] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:00:05,419] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:00:05,419] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:00:05,419] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:00:05,419] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:00:05,419] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:00:05,419] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:00:05,419] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:00:05,419] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:00:10,135] [INFO] [partition_parameters.py:348:__exit__] finished initializing model - num_params = 399, num_elems = 8.19B
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
|
||||
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-41_batch-block1-3736
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-1089
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-1089
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3038
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-39_batch-block1-3544
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3544
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3305
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3038
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3035
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3163
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3038
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3038
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3163
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3038
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-41_batch-block1-3736
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3035
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3038
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-1089
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-41_batch-block1-3305
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3544
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3163
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3305
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-41_batch-block1-3305
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-41_batch-block1-3736
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-1089
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3035
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3038
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3038
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-41_batch-block1-3736
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3544
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3163
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-1089
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-41_batch-block1-3736
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collatorUsing Prefix Tree collator
|
||||
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-1089
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collatorUsing Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3163
|
||||
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3163
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3163
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3544
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3544
|
||||
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3544
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-41_batch-block1-3736
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3035
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3163
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-41_batch-block1-3736
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3035
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-41_batch-block1-1089
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-41_batch-block1-3736
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3544
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3305
|
||||
Using Prefix Tree collatorUsing Prefix Tree collator
|
||||
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-41_batch-block1-3305
|
||||
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-41_batch-block1-3305
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3035
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3035
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-1089
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3305
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3035
|
||||
Using Prefix Tree collatorUsing Prefix Tree collatorUsing Prefix Tree collatorUsing Prefix Tree collator
|
||||
|
||||
|
||||
|
||||
Using Prefix Tree collatorUsing Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']args.report_to: ['wandb']args.report_to: ['wandb']
|
||||
|
||||
|
||||
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-00-03_batch-block1-3061args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-00-03_batch-block1-3061args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-00-03_batch-block1-3061
|
||||
|
||||
|
||||
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-00-03_batch-block1-3061
|
||||
args.report_to: ['wandb']args.report_to: ['wandb']
|
||||
|
||||
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-00-03_batch-block1-3061args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-00-03_batch-block1-3061
|
||||
|
||||
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-00-03_batch-block1-3061
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-00-03_batch-block1-3061
|
||||
Parameter Offload: Total persistent parameters: 308224 in 145 params
|
||||
{'train_runtime': 3.8014, 'train_samples_per_second': 2028.721, 'train_steps_per_second': 16.836, 'train_loss': 0.0, 'epoch': 8.0}
|
||||
[1;34mwandb[0m:
|
||||
[1;34mwandb[0m: 🚀 View run [33mruns/dev/tw-8node-resume[0m at: [34mhttps://wandb.ai/ligeng-zhu/ThreadWeaver/runs/tw-8node-resume[0m
|
||||
Reference in New Issue
Block a user