631 lines
38 KiB
Plaintext
631 lines
38 KiB
Plaintext
SLURM_JOB_ID = 9168614
|
||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||
RUN_NAME = tw-8node-resume
|
||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||
NNODES = 8
|
||
NODES = batch-block1-3273 batch-block1-3339 batch-block1-3736 batch-block1-3544 batch-block1-1089 batch-block1-3578 batch-block1-1062 batch-block1-3035
|
||
NODE_RANK = 3
|
||
GPUS_PER_NODE = 8
|
||
MASTER_ADDR = batch-block1-3273
|
||
MASTER_PORT = 25001
|
||
GLOBAL_TRAIN_BATCH_SIZE =
|
||
GRADIENT_ACCUMULATION_STEPS =
|
||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||
SLURM_JOB_ID = 9168614
|
||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||
RUN_NAME = tw-8node-resume
|
||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||
NNODES = 8
|
||
SLURM_JOB_ID = 9168614
|
||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||
RUN_NAME = tw-8node-resume
|
||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||
NNODES = 8
|
||
SLURM_JOB_ID = 9168614
|
||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||
RUN_NAME = tw-8node-resume
|
||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||
NNODES = 8
|
||
SLURM_JOB_ID = 9168614
|
||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||
RUN_NAME = tw-8node-resume
|
||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||
NNODES = 8
|
||
SLURM_JOB_ID = 9168614
|
||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||
RUN_NAME = tw-8node-resume
|
||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||
NNODES = 8
|
||
SLURM_JOB_ID = 9168614
|
||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||
RUN_NAME = tw-8node-resume
|
||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||
NNODES = 8
|
||
SLURM_JOB_ID = 9168614
|
||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||
RUN_NAME = tw-8node-resume
|
||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||
NNODES = 8
|
||
NODES = batch-block1-3273 batch-block1-3339 batch-block1-3736 batch-block1-3544 batch-block1-1089 batch-block1-3578 batch-block1-1062 batch-block1-3035
|
||
NODE_RANK = 7
|
||
GPUS_PER_NODE = 8
|
||
NODES = batch-block1-3273 batch-block1-3339 batch-block1-3736 batch-block1-3544 batch-block1-1089 batch-block1-3578 batch-block1-1062 batch-block1-3035
|
||
NODE_RANK = 6
|
||
GPUS_PER_NODE = 8
|
||
NODES = batch-block1-3273 batch-block1-3339 batch-block1-3736 batch-block1-3544 batch-block1-1089 batch-block1-3578 batch-block1-1062 batch-block1-3035
|
||
NODE_RANK = 0
|
||
GPUS_PER_NODE = 8
|
||
NODES = batch-block1-3273 batch-block1-3339 batch-block1-3736 batch-block1-3544 batch-block1-1089 batch-block1-3578 batch-block1-1062 batch-block1-3035
|
||
NODE_RANK = 1
|
||
GPUS_PER_NODE = 8
|
||
NODES = batch-block1-3273 batch-block1-3339 batch-block1-3736 batch-block1-3544 batch-block1-1089 batch-block1-3578 batch-block1-1062 batch-block1-3035
|
||
NODE_RANK = 4
|
||
GPUS_PER_NODE = 8
|
||
NODES = batch-block1-3273 batch-block1-3339 batch-block1-3736 batch-block1-3544 batch-block1-1089 batch-block1-3578 batch-block1-1062 batch-block1-3035
|
||
NODE_RANK = 5
|
||
GPUS_PER_NODE = 8
|
||
MASTER_ADDR = batch-block1-3273
|
||
MASTER_PORT = 25001
|
||
GLOBAL_TRAIN_BATCH_SIZE =
|
||
GRADIENT_ACCUMULATION_STEPS =
|
||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||
MASTER_ADDR = batch-block1-3273
|
||
MASTER_PORT = 25001
|
||
GLOBAL_TRAIN_BATCH_SIZE =
|
||
GRADIENT_ACCUMULATION_STEPS =
|
||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||
NODES = batch-block1-3273 batch-block1-3339 batch-block1-3736 batch-block1-3544 batch-block1-1089 batch-block1-3578 batch-block1-1062 batch-block1-3035
|
||
NODE_RANK = 2
|
||
GPUS_PER_NODE = 8
|
||
MASTER_ADDR = batch-block1-3273
|
||
MASTER_PORT = 25001
|
||
GLOBAL_TRAIN_BATCH_SIZE =
|
||
GRADIENT_ACCUMULATION_STEPS =
|
||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||
MASTER_ADDR = batch-block1-3273
|
||
MASTER_PORT = 25001
|
||
GLOBAL_TRAIN_BATCH_SIZE =
|
||
GRADIENT_ACCUMULATION_STEPS =
|
||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||
MASTER_ADDR = batch-block1-3273
|
||
MASTER_ADDR = batch-block1-3273
|
||
MASTER_PORT = 25001
|
||
GLOBAL_TRAIN_BATCH_SIZE =
|
||
GRADIENT_ACCUMULATION_STEPS =
|
||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||
MASTER_PORT = 25001
|
||
GLOBAL_TRAIN_BATCH_SIZE =
|
||
GRADIENT_ACCUMULATION_STEPS =
|
||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||
MASTER_ADDR = batch-block1-3273
|
||
MASTER_PORT = 25001
|
||
GLOBAL_TRAIN_BATCH_SIZE =
|
||
GRADIENT_ACCUMULATION_STEPS =
|
||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1Imported prefix tree collator v1
|
||
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1Imported prefix tree collator v1
|
||
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1Imported prefix tree collator v1
|
||
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1Imported prefix tree collator v1
|
||
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1Imported prefix tree collator v1
|
||
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1Imported prefix tree collator v1
|
||
Imported prefix tree collator v1Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
|
||
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1Imported prefix tree collator v1
|
||
Imported prefix tree collator v1Imported prefix tree collator v1
|
||
|
||
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
[2026-04-13 22:52:44,641] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:44,641] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:44,641] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:44,641] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:44,641] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:44,641] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:44,641] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:44,641] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:44,823] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:44,823] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:44,823] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:44,823] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:44,823] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:44,823] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:44,823] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:44,824] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:44,830] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:44,830] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:44,830] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:44,830] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:44,830] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:44,830] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:44,830] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:44,831] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:45,044] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:45,044] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:45,044] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:45,044] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:45,044] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:45,044] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:45,044] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:45,044] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:45,505] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:45,505] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:45,505] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:45,505] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:45,505] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:45,505] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:45,505] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:45,505] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:45,710] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:45,710] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:45,710] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:45,710] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:45,710] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:45,710] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:45,710] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:45,711] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:46,098] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:46,098] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:46,098] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:46,098] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:46,098] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:46,098] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:46,098] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:46,099] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:46,114] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:46,114] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:46,114] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:46,114] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:46,114] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:46,114] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:46,114] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:46,115] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:52:57,932] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:52:57,932] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:52:57,932] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:52:57,932] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:52:57,932] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:52:57,932] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:52:57,932] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:52:57,932] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:52:58,617] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:52:58,617] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:52:58,617] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:52:58,617] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:52:58,617] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:52:58,617] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:52:58,618] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:52:58,618] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:52:58,976] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:52:58,976] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:52:58,976] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:52:58,976] [INFO] [comm.py:706:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
|
||
[2026-04-13 22:52:58,976] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:52:58,977] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:52:58,977] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:52:58,976] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:52:58,977] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:52:58,977] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:52:58,977] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:52:58,977] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:52:58,977] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:52:58,977] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:52:58,978] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:52:58,978] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:52:58,978] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:52:59,398] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:52:59,398] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:52:59,398] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:52:59,399] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:52:59,399] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:52:59,399] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:52:59,399] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:52:59,399] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:52:59,650] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:52:59,650] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:52:59,650] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:52:59,650] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:52:59,650] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:52:59,650] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:52:59,650] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:52:59,650] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:52:59,893] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:52:59,894] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:52:59,924] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:52:59,924] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:52:59,924] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:52:59,988] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:52:59,988] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:52:59,988] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:52:59,988] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:52:59,988] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:52:59,988] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:52:59,988] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:52:59,988] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:52:59,988] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:52:59,988] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:52:59,988] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:52:59,989] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:53:00,251] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:53:00,305] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:53:00,315] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:53:00,318] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:53:00,318] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:53:00,320] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:53:00,320] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:53:00,320] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:53:00,321] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:53:00,332] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:53:00,333] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:53:00,333] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:53:00,337] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:53:00,338] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:53:00,338] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:53:00,342] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:53:00,383] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:53:00,383] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:53:00,383] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:53:00,383] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:53:00,383] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:53:00,383] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:53:00,383] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:53:00,384] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:53:00,471] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:53:00,471] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:53:00,471] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:53:00,471] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:53:00,472] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:53:00,472] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:53:00,472] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:53:00,472] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:53:02,594] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:53:02,604] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:53:02,604] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:53:02,627] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:53:02,628] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:53:02,634] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:53:02,634] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:53:02,645] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:53:02,672] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:53:02,672] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:53:02,672] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:53:02,672] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:53:02,675] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:53:02,680] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:53:02,692] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:53:02,693] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:53:05,737] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:53:05,750] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:53:05,779] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:53:05,813] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:53:05,854] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:53:05,858] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:53:05,885] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:53:05,899] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:53:05,899] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:53:05,917] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:53:05,932] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:53:05,949] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:53:05,959] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:53:05,959] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:53:05,961] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:53:10,377] [INFO] [partition_parameters.py:348:__exit__] finished initializing model - num_params = 399, num_elems = 8.19B
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3578Using Prefix Tree collator
|
||
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']Using Prefix Tree collator
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3578
|
||
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3578
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3578
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3578
|
||
args.report_to: ['wandb']
|
||
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3578
|
||
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3578
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3578
|
||
Using Prefix Tree collatorUsing Prefix Tree collatorUsing Prefix Tree collator
|
||
|
||
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']args.report_to: ['wandb']
|
||
|
||
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-1062args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-1062
|
||
|
||
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-1062
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-1062
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-1062
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-1062
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-1062
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-1062
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collatorUsing Prefix Tree collator
|
||
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collatorUsing Prefix Tree collator
|
||
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-3736
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-3736
|
||
args.report_to: ['wandb']args.report_to: ['wandb']
|
||
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-3736args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-3736
|
||
|
||
args.report_to: ['wandb']args.report_to: ['wandb']
|
||
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-3736args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-3736
|
||
|
||
args.report_to: ['wandb']
|
||
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-3736
|
||
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-3736
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collatorUsing Prefix Tree collator
|
||
|
||
Using Prefix Tree collatorUsing Prefix Tree collator
|
||
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-1089
|
||
args.report_to: ['wandb']
|
||
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-1089
|
||
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-1089
|
||
args.report_to: ['wandb']args.report_to: ['wandb']
|
||
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-1089args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-1089
|
||
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-1089
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-1089
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-1089
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collatorUsing Prefix Tree collator
|
||
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3273
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3273
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3273
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3273
|
||
Using Prefix Tree collatorargs.report_to: ['wandb']
|
||
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3273
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3273
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3273
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3273
|
||
Using Prefix Tree collatorUsing Prefix Tree collator
|
||
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']args.report_to: ['wandb']args.report_to: ['wandb']
|
||
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3339args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3339
|
||
|
||
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3339
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3339
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3339
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3339
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3339args.report_to: ['wandb']
|
||
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3339
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collatorUsing Prefix Tree collator
|
||
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3035
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3035
|
||
args.report_to: ['wandb']
|
||
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3035
|
||
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3035
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']Using Prefix Tree collator
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3035
|
||
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3035
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3035
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3035
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collatorUsing Prefix Tree collatorUsing Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
|
||
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-57_batch-block1-3544
|
||
args.report_to: ['wandb']args.report_to: ['wandb']
|
||
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-57_batch-block1-3544args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-57_batch-block1-3544
|
||
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-57_batch-block1-3544
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-57_batch-block1-3544
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-57_batch-block1-3544
|
||
args.report_to: ['wandb']args.report_to: ['wandb']
|
||
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-57_batch-block1-3544args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-57_batch-block1-3544
|
||
|
||
Parameter Offload: Total persistent parameters: 308224 in 145 params
|
||
{'train_runtime': 4.1111, 'train_samples_per_second': 1875.899, 'train_steps_per_second': 15.568, 'train_loss': 0.0, 'epoch': 8.0}
|
||
[1;34mwandb[0m:
|
||
[1;34mwandb[0m: 🚀 View run [33mruns/dev/tw-8node-resume[0m at: [34mhttps://wandb.ai/ligeng-zhu/ThreadWeaver/runs/tw-8node-resume[0m
|