631 lines
38 KiB
Plaintext
631 lines
38 KiB
Plaintext
SLURM_JOB_ID = 9168619
|
||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||
RUN_NAME = tw-8node-resume
|
||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||
NNODES = 8
|
||
SLURM_JOB_ID = 9168619
|
||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||
RUN_NAME = tw-8node-resume
|
||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||
NNODES = 8
|
||
SLURM_JOB_ID = 9168619
|
||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||
RUN_NAME = tw-8node-resume
|
||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||
NNODES = 8
|
||
SLURM_JOB_ID = 9168619
|
||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||
RUN_NAME = tw-8node-resume
|
||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||
NNODES = 8
|
||
SLURM_JOB_ID = 9168619
|
||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||
RUN_NAME = tw-8node-resume
|
||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||
NNODES = 8
|
||
SLURM_JOB_ID = 9168619
|
||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||
RUN_NAME = tw-8node-resume
|
||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||
NNODES = 8
|
||
SLURM_JOB_ID = 9168619
|
||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||
RUN_NAME = tw-8node-resume
|
||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||
NNODES = 8
|
||
NODES = batch-block1-3273 batch-block1-3339 batch-block1-3736 batch-block1-3544 batch-block1-1089 batch-block1-3578 batch-block1-1062 batch-block1-3035
|
||
NODE_RANK = 3
|
||
GPUS_PER_NODE = 8
|
||
SLURM_JOB_ID = 9168619
|
||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||
RUN_NAME = tw-8node-resume
|
||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||
NNODES = 8
|
||
NODES = batch-block1-3273 batch-block1-3339 batch-block1-3736 batch-block1-3544 batch-block1-1089 batch-block1-3578 batch-block1-1062 batch-block1-3035
|
||
NODE_RANK = 7
|
||
GPUS_PER_NODE = 8
|
||
NODES = batch-block1-3273 batch-block1-3339 batch-block1-3736 batch-block1-3544 batch-block1-1089 batch-block1-3578 batch-block1-1062 batch-block1-3035
|
||
NODE_RANK = 0
|
||
GPUS_PER_NODE = 8
|
||
NODES = batch-block1-3273 batch-block1-3339 batch-block1-3736 batch-block1-3544 batch-block1-1089 batch-block1-3578 batch-block1-1062 batch-block1-3035
|
||
NODE_RANK = 6
|
||
GPUS_PER_NODE = 8
|
||
NODES = batch-block1-3273 batch-block1-3339 batch-block1-3736 batch-block1-3544 batch-block1-1089 batch-block1-3578 batch-block1-1062 batch-block1-3035
|
||
NODE_RANK = 4
|
||
GPUS_PER_NODE = 8
|
||
NODES = batch-block1-3273 batch-block1-3339 batch-block1-3736 batch-block1-3544 batch-block1-1089 batch-block1-3578 batch-block1-1062 batch-block1-3035
|
||
NODE_RANK = 5
|
||
GPUS_PER_NODE = 8
|
||
NODES = batch-block1-3273 batch-block1-3339 batch-block1-3736 batch-block1-3544 batch-block1-1089 batch-block1-3578 batch-block1-1062 batch-block1-3035
|
||
NODE_RANK = 1
|
||
GPUS_PER_NODE = 8
|
||
MASTER_ADDR = batch-block1-3273
|
||
MASTER_PORT = 25001
|
||
GLOBAL_TRAIN_BATCH_SIZE =
|
||
GRADIENT_ACCUMULATION_STEPS =
|
||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||
NODES = batch-block1-3273 batch-block1-3339 batch-block1-3736 batch-block1-3544 batch-block1-1089 batch-block1-3578 batch-block1-1062 batch-block1-3035
|
||
NODE_RANK = 2
|
||
GPUS_PER_NODE = 8
|
||
MASTER_ADDR = batch-block1-3273
|
||
MASTER_PORT = 25001
|
||
GLOBAL_TRAIN_BATCH_SIZE =
|
||
GRADIENT_ACCUMULATION_STEPS =
|
||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||
MASTER_ADDR = batch-block1-3273
|
||
MASTER_PORT = 25001
|
||
GLOBAL_TRAIN_BATCH_SIZE =
|
||
GRADIENT_ACCUMULATION_STEPS =
|
||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||
MASTER_ADDR = batch-block1-3273
|
||
MASTER_PORT = 25001
|
||
GLOBAL_TRAIN_BATCH_SIZE =
|
||
GRADIENT_ACCUMULATION_STEPS =
|
||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||
MASTER_ADDR = batch-block1-3273
|
||
MASTER_PORT = 25001
|
||
GLOBAL_TRAIN_BATCH_SIZE =
|
||
GRADIENT_ACCUMULATION_STEPS =
|
||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||
MASTER_ADDR = batch-block1-3273
|
||
MASTER_PORT = 25001
|
||
GLOBAL_TRAIN_BATCH_SIZE =
|
||
GRADIENT_ACCUMULATION_STEPS =
|
||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||
MASTER_ADDR = batch-block1-3273
|
||
MASTER_PORT = 25001
|
||
GLOBAL_TRAIN_BATCH_SIZE =
|
||
GRADIENT_ACCUMULATION_STEPS =
|
||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||
MASTER_ADDR = batch-block1-3273
|
||
MASTER_PORT = 25001
|
||
GLOBAL_TRAIN_BATCH_SIZE =
|
||
GRADIENT_ACCUMULATION_STEPS =
|
||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
[2026-04-13 22:55:05,743] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:05,789] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:05,789] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:05,789] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:05,790] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:05,790] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:05,791] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:05,810] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:05,826] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:05,827] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:05,834] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:05,834] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:05,856] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:05,856] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:05,858] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:05,860] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:05,867] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:05,875] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:05,882] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:05,882] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:05,883] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:05,887] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:05,888] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:05,890] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:05,892] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:05,894] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:05,896] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:05,898] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:05,899] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:05,903] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:05,906] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:05,910] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:06,046] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:06,077] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:06,102] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:06,128] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:06,130] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:06,131] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:06,192] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:06,192] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:06,192] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:06,199] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:06,212] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:06,224] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:06,225] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:06,228] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:06,229] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:06,237] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:06,259] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:06,271] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:06,290] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:06,376] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:06,379] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:06,383] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:06,385] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:06,385] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:06,641] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:06,666] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:06,714] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:06,736] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:06,769] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:06,786] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:06,791] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:06,799] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:55:14,032] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:14,042] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:14,434] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:14,435] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:14,438] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:14,446] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:14,532] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:14,533] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:14,538] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:14,538] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:14,541] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:14,600] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:14,603] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:14,615] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:14,621] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:14,648] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:14,665] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:14,682] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:14,696] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:14,718] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:14,760] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:14,821] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:14,830] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:14,867] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:14,875] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:14,879] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:14,883] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:14,888] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:14,894] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:14,910] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:14,924] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:14,924] [INFO] [comm.py:706:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
|
||
[2026-04-13 22:55:14,943] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:14,964] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:14,989] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:15,005] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:15,049] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:15,050] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:15,101] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:15,107] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:15,145] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:15,188] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:15,201] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:15,203] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:15,217] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:15,249] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:15,271] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:15,295] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:15,315] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:15,354] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:15,383] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:15,416] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:15,419] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:15,441] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:15,450] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:15,471] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:15,491] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:15,499] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:15,518] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:15,524] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:15,546] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:15,577] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:15,579] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:15,585] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:15,588] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:55:17,410] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:17,419] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:17,424] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:17,440] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:17,454] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:17,459] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:17,491] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:17,494] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:17,498] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:17,542] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:17,555] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:17,752] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:17,754] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:17,754] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:17,757] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:17,762] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:17,762] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:21,637] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:21,748] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:21,866] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:21,867] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:21,870] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:21,876] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:21,885] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:21,900] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:21,904] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:21,907] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:21,908] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:21,908] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:21,912] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:21,921] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:22,076] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:22,108] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:22,295] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:26,257] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:26,288] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:26,302] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:26,302] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:26,325] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:26,337] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:26,341] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:26,341] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:26,346] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:26,358] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:26,370] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:26,371] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:26,374] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:26,376] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:26,389] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:26,391] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:26,392] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:26,395] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:26,395] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:26,405] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:26,409] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:26,420] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:26,454] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:26,457] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:26,468] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:26,468] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:26,485] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:26,486] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:27,300] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:27,321] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:55:31,232] [INFO] [partition_parameters.py:348:__exit__] finished initializing model - num_params = 399, num_elems = 8.19B
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-1062
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']
|
||
Using Prefix Tree collator
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3578
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-1062
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-1089
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3339
|
||
Using Prefix Tree collator
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-1062
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3035
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-3736
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-3544
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-1062
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-3736
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-1089
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-3544
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-3544
|
||
args.report_to: ['wandb']
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-3544
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-1089
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3035
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-3544
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-1089
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-1062
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']
|
||
Using Prefix Tree collator
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3035
|
||
Using Prefix Tree collator
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3339
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3273
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3273
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3578
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3273
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3339
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-1089
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3339
|
||
args.report_to: ['wandb']
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3035
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-1062
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-3736
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3578
|
||
args.report_to: ['wandb']
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-1062
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-3736
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']
|
||
Using Prefix Tree collator
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3035
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-1089
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']
|
||
args.report_to: ['wandb']
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3578
|
||
Using Prefix Tree collator
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-1062
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3339
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3578
|
||
args.report_to: ['wandb']
|
||
Using Prefix Tree collator
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3035
|
||
args.report_to: ['wandb']
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-3736
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']
|
||
args.report_to: ['wandb']
|
||
args.report_to: ['wandb']
|
||
Using Prefix Tree collator
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3339
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3273
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-1089
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-1089
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3544args.report_to: ['wandb']
|
||
args.report_to: ['wandb']
|
||
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-3544
|
||
Using Prefix Tree collator
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3578
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-3736
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-3544
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-3578
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-3736
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3339
|
||
args.report_to: ['wandb']
|
||
Using Prefix Tree collator
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3273
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3578
|
||
args.report_to: ['wandb']
|
||
Using Prefix Tree collator
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3736
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3273
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3273
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3339
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3273
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3035
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3035
|
||
Parameter Offload: Total persistent parameters: 308224 in 145 params
|
||
{'train_runtime': 3.3092, 'train_samples_per_second': 2330.489, 'train_steps_per_second': 19.34, 'train_loss': 0.0, 'epoch': 8.0}
|
||
[1;34mwandb[0m:
|
||
[1;34mwandb[0m: 🚀 View run [33mruns/dev/tw-8node-resume[0m at: [34mhttps://wandb.ai/ligeng-zhu/ThreadWeaver/runs/tw-8node-resume[0m
|