初始化项目,由ModelHub XC社区提供模型
Model: ligeng-dev/q3-8b-train_final_v2_nb2_mt8192_replaced_fix Source: Original Platform
This commit is contained in:
630
slurm/9168640.0.out
Normal file
630
slurm/9168640.0.out
Normal file
@@ -0,0 +1,630 @@
|
||||
SLURM_JOB_ID = 9168640
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
SLURM_JOB_ID = 9168640
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
SLURM_JOB_ID = 9168640
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
SLURM_JOB_ID = 9168640
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
SLURM_JOB_ID = 9168640
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
SLURM_JOB_ID = 9168640
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
NODES = batch-block1-1015 batch-block1-3297 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3908 batch-block1-3227
|
||||
NODE_RANK = 3
|
||||
GPUS_PER_NODE = 8
|
||||
NODES = batch-block1-1015 batch-block1-3297 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3908 batch-block1-3227
|
||||
NODE_RANK = 7
|
||||
GPUS_PER_NODE = 8
|
||||
NODES = batch-block1-1015 batch-block1-3297 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3908 batch-block1-3227
|
||||
NODE_RANK = 4
|
||||
GPUS_PER_NODE = 8
|
||||
SLURM_JOB_ID = 9168640
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
NODES = batch-block1-1015 batch-block1-3297 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3908 batch-block1-3227
|
||||
NODE_RANK = 6
|
||||
GPUS_PER_NODE = 8
|
||||
NODES = batch-block1-1015 batch-block1-3297 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3908 batch-block1-3227
|
||||
NODE_RANK = 5
|
||||
GPUS_PER_NODE = 8
|
||||
NODES = batch-block1-1015 batch-block1-3297 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3908 batch-block1-3227
|
||||
NODE_RANK = 1
|
||||
GPUS_PER_NODE = 8
|
||||
MASTER_ADDR = batch-block1-1015
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
MASTER_ADDR = batch-block1-1015
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
MASTER_ADDR = batch-block1-1015
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
NODES = batch-block1-1015 batch-block1-3297 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3908 batch-block1-3227
|
||||
NODE_RANK = 2
|
||||
GPUS_PER_NODE = 8
|
||||
MASTER_ADDR = batch-block1-1015
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
MASTER_ADDR = batch-block1-1015
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
MASTER_ADDR = batch-block1-1015
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
MASTER_ADDR = batch-block1-1015
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
SLURM_JOB_ID = 9168640
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
NODES = batch-block1-1015 batch-block1-3297 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3908 batch-block1-3227
|
||||
NODE_RANK = 0
|
||||
GPUS_PER_NODE = 8
|
||||
MASTER_ADDR = batch-block1-1015
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
[2026-04-13 23:06:06,690] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,691] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,738] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,739] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,745] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,746] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,753] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,753] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,806] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,811] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,827] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,828] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,837] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,837] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,840] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,843] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,844] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,849] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,849] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,851] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,853] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,860] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,865] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,870] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,880] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,890] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,892] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,895] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,896] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,898] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,898] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,919] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,927] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,927] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,932] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,931] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,944] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,953] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,960] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,963] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,968] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,973] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,974] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,976] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,991] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:07,006] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:07,009] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:07,010] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:07,010] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:07,010] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:07,016] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:07,036] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:07,037] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:07,039] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:07,040] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:07,040] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:07,041] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:07,077] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:07,077] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:07,081] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:07,104] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:07,107] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:07,109] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:07,124] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:14,863] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:14,928] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:14,998] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,030] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,096] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,122] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,201] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,223] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,228] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,260] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,282] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,287] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,300] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,314] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,331] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,339] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,344] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,389] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,404] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,432] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,438] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,484] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,522] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,559] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,610] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,657] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,670] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,676] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,809] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,823] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,887] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,912] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,919] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,931] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,935] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,963] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,974] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:16,071] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:16,073] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:16,074] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:16,102] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:16,116] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:16,116] [INFO] [comm.py:706:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
|
||||
[2026-04-13 23:06:16,209] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:16,210] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:16,230] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:16,307] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:16,315] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:16,317] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:16,373] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:16,380] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:16,381] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:16,386] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:16,406] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:16,420] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:16,463] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:16,464] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:16,467] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:16,573] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,578] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,583] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,613] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,613] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,620] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,627] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,628] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,631] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,648] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,667] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,669] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,689] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,696] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:16,720] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,723] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,724] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,750] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,763] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,778] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,800] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:16,827] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:16,839] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:16,844] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:16,858] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:16,886] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:16,892] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,893] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,896] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,898] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,904] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,910] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,911] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,913] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,922] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,951] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,952] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,954] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:17,285] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:17,331] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:17,427] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:17,444] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:17,446] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:17,456] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:17,464] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:23,267] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:23,284] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:23,306] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:23,330] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:23,359] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:23,360] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:23,367] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:23,373] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:23,374] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:23,383] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:23,373] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:23,446] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:23,509] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:23,510] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:23,520] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:23,537] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:27,059] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:27,080] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:27,101] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:27,149] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:27,154] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:27,158] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:27,171] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:27,171] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:27,194] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:27,195] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:31,119] [INFO] [partition_parameters.py:348:__exit__] finished initializing model - num_params = 399, num_elems = 8.19B
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-3297
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-1134
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-1134
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-3848
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-1134
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-3848
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-3848
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-14_batch-block1-1027
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-1027
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-1134
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3227
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-1015
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3273
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-3848
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3227
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-3297
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3273
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3273
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-3297
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3227
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-1027
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-3848
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-1027
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-1015
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-1134
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-3297
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3273
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-3848
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3227
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-3848
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3273
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3227
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-1015
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3908
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-3297
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collatorargs.report_to: ['wandb']
|
||||
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3908
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-1134
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3908
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-1015
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3908
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3908
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3273
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-1134
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-1015
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-1015
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3227
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3273
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-1027
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-3297
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-3848
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-1027
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-1015
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3908
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-1027
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-1015
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-3297
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-14_batch-block1-1027
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3908
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3273
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3227
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-3227
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-3297
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3908
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-14_batch-block1-1134
|
||||
Parameter Offload: Total persistent parameters: 308224 in 145 params
|
||||
{'train_runtime': 3.1226, 'train_samples_per_second': 2469.703, 'train_steps_per_second': 20.495, 'train_loss': 0.0, 'epoch': 8.0}
|
||||
[1;34mwandb[0m:
|
||||
[1;34mwandb[0m: 🚀 View run [33mruns/dev/tw-8node-resume[0m at: [34mhttps://wandb.ai/ligeng-zhu/ThreadWeaver/runs/tw-8node-resume[0m
|
||||
Reference in New Issue
Block a user