631 lines
38 KiB
Plaintext
631 lines
38 KiB
Plaintext
|
|
SLURM_JOB_ID = 9168643
|
|||
|
|
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
|||
|
|
RUN_NAME = tw-8node-resume
|
|||
|
|
OUTPUT_DIR = runs/dev/tw-8node-resume
|
|||
|
|
NNODES = 8
|
|||
|
|
NODES = batch-block1-1015 batch-block1-3297 batch-block1-3887 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3227
|
|||
|
|
NODE_RANK = 2
|
|||
|
|
GPUS_PER_NODE = 8
|
|||
|
|
MASTER_ADDR = batch-block1-1015
|
|||
|
|
MASTER_PORT = 25001
|
|||
|
|
GLOBAL_TRAIN_BATCH_SIZE =
|
|||
|
|
GRADIENT_ACCUMULATION_STEPS =
|
|||
|
|
PER_DEVICE_TRAIN_BATCH_SIZE =
|
|||
|
|
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
SLURM_JOB_ID = 9168643
|
|||
|
|
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
|||
|
|
RUN_NAME = tw-8node-resume
|
|||
|
|
OUTPUT_DIR = runs/dev/tw-8node-resume
|
|||
|
|
NNODES = 8
|
|||
|
|
SLURM_JOB_ID = 9168643
|
|||
|
|
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
|||
|
|
RUN_NAME = tw-8node-resume
|
|||
|
|
OUTPUT_DIR = runs/dev/tw-8node-resume
|
|||
|
|
NNODES = 8
|
|||
|
|
SLURM_JOB_ID = 9168643
|
|||
|
|
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
|||
|
|
RUN_NAME = tw-8node-resume
|
|||
|
|
OUTPUT_DIR = runs/dev/tw-8node-resume
|
|||
|
|
NNODES = 8
|
|||
|
|
SLURM_JOB_ID = 9168643
|
|||
|
|
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
|||
|
|
RUN_NAME = tw-8node-resume
|
|||
|
|
OUTPUT_DIR = runs/dev/tw-8node-resume
|
|||
|
|
NNODES = 8
|
|||
|
|
SLURM_JOB_ID = 9168643
|
|||
|
|
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
|||
|
|
RUN_NAME = tw-8node-resume
|
|||
|
|
OUTPUT_DIR = runs/dev/tw-8node-resume
|
|||
|
|
NNODES = 8
|
|||
|
|
SLURM_JOB_ID = 9168643
|
|||
|
|
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
|||
|
|
RUN_NAME = tw-8node-resume
|
|||
|
|
OUTPUT_DIR = runs/dev/tw-8node-resume
|
|||
|
|
NNODES = 8
|
|||
|
|
NODES = batch-block1-1015 batch-block1-3297 batch-block1-3887 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3227
|
|||
|
|
NODE_RANK = 6
|
|||
|
|
GPUS_PER_NODE = 8
|
|||
|
|
SLURM_JOB_ID = 9168643
|
|||
|
|
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
|||
|
|
RUN_NAME = tw-8node-resume
|
|||
|
|
OUTPUT_DIR = runs/dev/tw-8node-resume
|
|||
|
|
NNODES = 8
|
|||
|
|
NODES = batch-block1-1015 batch-block1-3297 batch-block1-3887 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3227
|
|||
|
|
NODE_RANK = 4
|
|||
|
|
GPUS_PER_NODE = 8
|
|||
|
|
NODES = batch-block1-1015 batch-block1-3297 batch-block1-3887 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3227
|
|||
|
|
NODE_RANK = 5
|
|||
|
|
GPUS_PER_NODE = 8
|
|||
|
|
NODES = batch-block1-1015 batch-block1-3297 batch-block1-3887 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3227
|
|||
|
|
NODE_RANK = 1
|
|||
|
|
GPUS_PER_NODE = 8
|
|||
|
|
NODES = batch-block1-1015 batch-block1-3297 batch-block1-3887 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3227
|
|||
|
|
NODE_RANK = 7
|
|||
|
|
GPUS_PER_NODE = 8
|
|||
|
|
NODES = batch-block1-1015 batch-block1-3297 batch-block1-3887 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3227
|
|||
|
|
NODE_RANK = 3
|
|||
|
|
GPUS_PER_NODE = 8
|
|||
|
|
MASTER_ADDR = batch-block1-1015
|
|||
|
|
MASTER_PORT = 25001
|
|||
|
|
GLOBAL_TRAIN_BATCH_SIZE =
|
|||
|
|
GRADIENT_ACCUMULATION_STEPS =
|
|||
|
|
PER_DEVICE_TRAIN_BATCH_SIZE =
|
|||
|
|
NODES = batch-block1-1015 batch-block1-3297 batch-block1-3887 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3227
|
|||
|
|
NODE_RANK = 0
|
|||
|
|
GPUS_PER_NODE = 8
|
|||
|
|
MASTER_ADDR = batch-block1-1015
|
|||
|
|
MASTER_PORT = 25001
|
|||
|
|
GLOBAL_TRAIN_BATCH_SIZE =
|
|||
|
|
GRADIENT_ACCUMULATION_STEPS =
|
|||
|
|
PER_DEVICE_TRAIN_BATCH_SIZE =
|
|||
|
|
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
MASTER_ADDR = batch-block1-1015
|
|||
|
|
MASTER_PORT = 25001
|
|||
|
|
GLOBAL_TRAIN_BATCH_SIZE =
|
|||
|
|
GRADIENT_ACCUMULATION_STEPS =
|
|||
|
|
PER_DEVICE_TRAIN_BATCH_SIZE =
|
|||
|
|
MASTER_ADDR = batch-block1-1015
|
|||
|
|
MASTER_PORT = 25001
|
|||
|
|
GLOBAL_TRAIN_BATCH_SIZE =
|
|||
|
|
GRADIENT_ACCUMULATION_STEPS =
|
|||
|
|
PER_DEVICE_TRAIN_BATCH_SIZE =
|
|||
|
|
MASTER_ADDR = batch-block1-1015
|
|||
|
|
MASTER_PORT = 25001
|
|||
|
|
GLOBAL_TRAIN_BATCH_SIZE =
|
|||
|
|
GRADIENT_ACCUMULATION_STEPS =
|
|||
|
|
PER_DEVICE_TRAIN_BATCH_SIZE =
|
|||
|
|
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
MASTER_ADDR = batch-block1-1015
|
|||
|
|
MASTER_PORT = 25001
|
|||
|
|
GLOBAL_TRAIN_BATCH_SIZE =
|
|||
|
|
GRADIENT_ACCUMULATION_STEPS =
|
|||
|
|
PER_DEVICE_TRAIN_BATCH_SIZE =
|
|||
|
|
MASTER_ADDR = batch-block1-1015
|
|||
|
|
MASTER_PORT = 25001
|
|||
|
|
GLOBAL_TRAIN_BATCH_SIZE =
|
|||
|
|
GRADIENT_ACCUMULATION_STEPS =
|
|||
|
|
PER_DEVICE_TRAIN_BATCH_SIZE =
|
|||
|
|
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1Imported prefix tree collator v1
|
|||
|
|
|
|||
|
|
Imported prefix tree collator v1Imported prefix tree collator v1
|
|||
|
|
|
|||
|
|
Imported prefix tree collator v1Imported prefix tree collator v1
|
|||
|
|
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
[2026-04-13 23:07:56,557] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:07:56,558] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:07:56,616] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:07:56,617] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:07:56,643] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:07:56,643] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:07:56,731] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:07:56,747] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:07:56,760] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:07:56,762] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:07:56,785] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:07:56,787] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:07:56,787] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:07:56,792] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:07:56,800] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:07:56,806] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:07:56,920] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:07:56,927] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:07:56,955] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:07:57,003] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:07:57,028] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:07:57,036] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:07:57,071] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:07:57,082] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:07:57,082] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:07:57,083] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:07:57,083] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:07:57,083] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:07:57,084] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:07:57,085] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:07:57,085] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:07:57,086] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:07:57,087] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:07:57,088] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:07:57,092] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:07:57,093] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:07:57,098] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:07:57,120] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:07:57,122] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:07:57,125] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:07:57,140] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:07:57,142] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:07:57,147] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:07:57,163] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:07:57,167] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:07:57,170] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:07:57,172] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:07:57,175] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:07:57,175] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:07:57,179] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:07:57,185] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:07:57,204] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:07:57,217] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:07:57,219] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:07:57,230] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:07:57,237] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:08:05,010] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:05,025] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:05,053] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:05,070] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:05,071] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:05,113] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:05,253] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:05,279] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:05,373] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:05,378] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:05,432] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:05,464] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:05,500] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:05,548] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:05,609] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:05,613] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:05,629] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:05,656] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:05,660] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:05,691] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:05,693] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:05,693] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:05,694] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:05,740] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:05,782] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:05,795] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:05,806] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:05,823] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:05,831] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:05,841] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:05,845] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:05,847] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:05,850] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:05,859] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:05,859] [INFO] [comm.py:706:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
|
|||
|
|
[2026-04-13 23:08:05,881] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:05,893] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:05,948] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:05,965] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:05,970] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:05,989] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:06,001] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:06,007] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:06,011] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:06,013] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:06,039] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:06,078] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:06,118] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:06,128] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:06,129] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:06,131] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:06,132] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:06,145] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:06,193] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:06,197] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:06,240] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:06,257] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:06,371] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:06,391] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:06,431] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:06,444] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:06,492] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:06,500] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:06,506] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:06,510] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:06,514] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:06,518] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:06,518] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:06,518] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:06,520] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:06,523] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:06,528] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:06,529] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:06,540] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:06,546] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:06,550] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:06,551] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:06,555] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:06,579] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:06,609] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:06,630] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:06,645] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:06,697] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:06,714] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:06,714] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:06,732] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:06,735] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:06,740] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:06,740] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:06,755] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:13,160] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:13,184] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:13,184] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:13,185] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:13,224] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:13,239] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:13,240] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:13,243] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:13,243] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:13,245] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:13,254] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:13,264] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:13,272] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:13,291] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:13,452] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:13,457] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:17,477] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:17,531] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:17,547] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:17,548] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:17,558] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:17,645] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:17,648] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:18,097] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:08:18,097] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:08:18,097] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:08:18,097] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:08:18,097] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:08:18,097] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:08:18,097] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:08:18,098] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:08:30,097] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:30,098] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:30,098] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:30,099] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:30,109] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:30,110] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:30,110] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:30,110] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:08:32,677] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:32,677] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:32,677] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:32,677] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:32,677] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:32,677] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:32,677] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:32,677] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:08:36,881] [INFO] [partition_parameters.py:348:__exit__] finished initializing model - num_params = 399, num_elems = 8.19B
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3273
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1134
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-3297
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3848
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3848
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
args.report_to: ['wandb']Using Prefix Tree collator
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1134
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-1134
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1015
|
|||
|
|
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1027
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3273
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-1015
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-3297
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-1015
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3848
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3848
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3273
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-3297
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1134
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3273
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-1015
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3273
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3273
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3273
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-3297
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-1134
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3848
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-1015
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3848
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1134
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3227
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3297
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1027
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3297
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3848
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3227
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-1015
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1027
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3273
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3227
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3227
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-3297
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1027
|
|||
|
|
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1027
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1027
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3227
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3227
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-3297
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3848
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3227
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1027
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1015
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1027
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-1015
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1134
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-1134
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
Using Prefix Tree collatorUsing Prefix Tree collator
|
|||
|
|
Using Prefix Tree collatorUsing Prefix Tree collator
|
|||
|
|
|
|||
|
|
|
|||
|
|
Using Prefix Tree collatorUsing Prefix Tree collatorUsing Prefix Tree collator
|
|||
|
|
|
|||
|
|
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-30_batch-block1-3887
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-30_batch-block1-3887
|
|||
|
|
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-30_batch-block1-3887
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-30_batch-block1-3887
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-30_batch-block1-3887
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-30_batch-block1-3887
|
|||
|
|
|
|||
|
|
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-30_batch-block1-3887
|
|||
|
|
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-30_batch-block1-3887
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3227
|
|||
|
|
Parameter Offload: Total persistent parameters: 308224 in 145 params
|
|||
|
|
{'train_runtime': 3.1214, 'train_samples_per_second': 2470.663, 'train_steps_per_second': 20.503, 'train_loss': 0.0, 'epoch': 8.0}
|
|||
|
|
[1;34mwandb[0m:
|
|||
|
|
[1;34mwandb[0m: 🚀 View run [33mruns/dev/tw-8node-resume[0m at: [34mhttps://wandb.ai/ligeng-zhu/ThreadWeaver/runs/tw-8node-resume[0m
|