初始化项目,由ModelHub XC社区提供模型
Model: ligeng-dev/q3-8b-train_final_v2_nb2_mt8192_replaced_fix Source: Original Platform
This commit is contained in:
630
slurm/9168643.0.out
Normal file
630
slurm/9168643.0.out
Normal file
@@ -0,0 +1,630 @@
|
||||
SLURM_JOB_ID = 9168643
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
NODES = batch-block1-1015 batch-block1-3297 batch-block1-3887 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3227
|
||||
NODE_RANK = 2
|
||||
GPUS_PER_NODE = 8
|
||||
MASTER_ADDR = batch-block1-1015
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
SLURM_JOB_ID = 9168643
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
SLURM_JOB_ID = 9168643
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
SLURM_JOB_ID = 9168643
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
SLURM_JOB_ID = 9168643
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
SLURM_JOB_ID = 9168643
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
SLURM_JOB_ID = 9168643
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
NODES = batch-block1-1015 batch-block1-3297 batch-block1-3887 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3227
|
||||
NODE_RANK = 6
|
||||
GPUS_PER_NODE = 8
|
||||
SLURM_JOB_ID = 9168643
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
NODES = batch-block1-1015 batch-block1-3297 batch-block1-3887 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3227
|
||||
NODE_RANK = 4
|
||||
GPUS_PER_NODE = 8
|
||||
NODES = batch-block1-1015 batch-block1-3297 batch-block1-3887 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3227
|
||||
NODE_RANK = 5
|
||||
GPUS_PER_NODE = 8
|
||||
NODES = batch-block1-1015 batch-block1-3297 batch-block1-3887 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3227
|
||||
NODE_RANK = 1
|
||||
GPUS_PER_NODE = 8
|
||||
NODES = batch-block1-1015 batch-block1-3297 batch-block1-3887 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3227
|
||||
NODE_RANK = 7
|
||||
GPUS_PER_NODE = 8
|
||||
NODES = batch-block1-1015 batch-block1-3297 batch-block1-3887 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3227
|
||||
NODE_RANK = 3
|
||||
GPUS_PER_NODE = 8
|
||||
MASTER_ADDR = batch-block1-1015
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
NODES = batch-block1-1015 batch-block1-3297 batch-block1-3887 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3227
|
||||
NODE_RANK = 0
|
||||
GPUS_PER_NODE = 8
|
||||
MASTER_ADDR = batch-block1-1015
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
MASTER_ADDR = batch-block1-1015
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
MASTER_ADDR = batch-block1-1015
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
MASTER_ADDR = batch-block1-1015
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
MASTER_ADDR = batch-block1-1015
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
MASTER_ADDR = batch-block1-1015
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1Imported prefix tree collator v1
|
||||
|
||||
Imported prefix tree collator v1Imported prefix tree collator v1
|
||||
|
||||
Imported prefix tree collator v1Imported prefix tree collator v1
|
||||
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
[2026-04-13 23:07:56,557] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:56,558] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:56,616] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:56,617] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:56,643] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:56,643] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:56,731] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:56,747] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:56,760] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:56,762] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:56,785] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:56,787] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:56,787] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:56,792] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:56,800] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:56,806] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:56,920] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:56,927] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:56,955] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,003] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,028] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,036] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,071] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,082] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,082] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,083] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,083] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,083] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,084] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,085] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,085] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,086] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,087] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,088] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,092] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,093] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,098] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,120] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,122] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,125] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,140] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,142] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,147] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,163] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,167] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,170] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,172] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,175] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,175] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,179] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,185] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,204] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,217] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,219] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,230] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,237] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:08:05,010] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,025] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,053] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,070] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,071] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,113] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,253] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,279] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,373] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,378] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,432] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,464] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,500] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,548] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,609] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,613] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,629] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,656] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,660] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,691] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,693] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,693] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,694] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,740] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,782] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,795] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,806] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,823] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,831] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,841] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,845] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,847] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,850] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,859] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,859] [INFO] [comm.py:706:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
|
||||
[2026-04-13 23:08:05,881] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,893] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,948] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,965] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,970] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,989] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:06,001] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:06,007] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:06,011] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:06,013] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:06,039] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:06,078] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:06,118] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:06,128] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:06,129] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:06,131] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:06,132] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:06,145] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:06,193] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:06,197] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:06,240] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:06,257] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:06,371] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,391] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,431] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,444] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,492] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,500] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,506] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,510] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,514] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,518] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,518] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,518] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,520] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,523] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,528] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,529] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,540] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,546] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,550] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,551] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,555] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,579] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,609] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,630] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,645] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,697] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,714] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,714] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,732] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,735] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,740] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,740] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,755] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:13,160] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:13,184] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:13,184] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:13,185] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:13,224] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:13,239] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:13,240] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:13,243] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:13,243] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:13,245] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:13,254] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:13,264] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:13,272] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:13,291] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:13,452] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:13,457] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:17,477] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:17,531] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:17,547] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:17,548] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:17,558] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:17,645] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:17,648] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:18,097] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:08:18,097] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:08:18,097] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:08:18,097] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:08:18,097] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:08:18,097] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:08:18,097] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:08:18,098] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:08:30,097] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:30,098] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:30,098] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:30,099] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:30,109] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:30,110] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:30,110] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:30,110] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:32,677] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:32,677] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:32,677] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:32,677] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:32,677] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:32,677] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:32,677] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:32,677] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:36,881] [INFO] [partition_parameters.py:348:__exit__] finished initializing model - num_params = 399, num_elems = 8.19B
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3273
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1134
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-3297
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3848
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3848
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1134
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-1134
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1015
|
||||
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1027
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3273
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-1015
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-3297
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-1015
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3848
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3848
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3273
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-3297
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1134
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3273
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-1015
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3273
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3273
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3273
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-3297
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-1134
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3848
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-1015
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3848
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1134
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3227
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3297
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1027
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3297
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3848
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3227
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-1015
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1027
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3273
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3227
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3227
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-3297
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1027
|
||||
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1027
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1027
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3227
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3227
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-3297
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3848
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3227
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1027
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1015
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1027
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-1015
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1134
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-1134
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collatorUsing Prefix Tree collator
|
||||
Using Prefix Tree collatorUsing Prefix Tree collator
|
||||
|
||||
|
||||
Using Prefix Tree collatorUsing Prefix Tree collatorUsing Prefix Tree collator
|
||||
|
||||
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-30_batch-block1-3887
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-30_batch-block1-3887
|
||||
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-30_batch-block1-3887
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-30_batch-block1-3887
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-30_batch-block1-3887
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-30_batch-block1-3887
|
||||
|
||||
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-30_batch-block1-3887
|
||||
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-30_batch-block1-3887
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3227
|
||||
Parameter Offload: Total persistent parameters: 308224 in 145 params
|
||||
{'train_runtime': 3.1214, 'train_samples_per_second': 2470.663, 'train_steps_per_second': 20.503, 'train_loss': 0.0, 'epoch': 8.0}
|
||||
[1;34mwandb[0m:
|
||||
[1;34mwandb[0m: 🚀 View run [33mruns/dev/tw-8node-resume[0m at: [34mhttps://wandb.ai/ligeng-zhu/ThreadWeaver/runs/tw-8node-resume[0m
|
||||
Reference in New Issue
Block a user