631 lines
38 KiB
Plaintext
631 lines
38 KiB
Plaintext
|
|
SLURM_JOB_ID = 9168633
|
|||
|
|
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
|||
|
|
RUN_NAME = tw-8node-resume
|
|||
|
|
OUTPUT_DIR = runs/dev/tw-8node-resume
|
|||
|
|
NNODES = 8
|
|||
|
|
NODES = batch-block1-1015 batch-block1-3297 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3908 batch-block1-3227
|
|||
|
|
NODE_RANK = 5
|
|||
|
|
GPUS_PER_NODE = 8
|
|||
|
|
SLURM_JOB_ID = 9168633
|
|||
|
|
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
|||
|
|
RUN_NAME = tw-8node-resume
|
|||
|
|
OUTPUT_DIR = runs/dev/tw-8node-resume
|
|||
|
|
NNODES = 8
|
|||
|
|
MASTER_ADDR = batch-block1-1015
|
|||
|
|
MASTER_PORT = 25001
|
|||
|
|
GLOBAL_TRAIN_BATCH_SIZE =
|
|||
|
|
GRADIENT_ACCUMULATION_STEPS =
|
|||
|
|
PER_DEVICE_TRAIN_BATCH_SIZE =
|
|||
|
|
SLURM_JOB_ID = 9168633
|
|||
|
|
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
|||
|
|
RUN_NAME = tw-8node-resume
|
|||
|
|
OUTPUT_DIR = runs/dev/tw-8node-resume
|
|||
|
|
NNODES = 8
|
|||
|
|
SLURM_JOB_ID = 9168633
|
|||
|
|
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
|||
|
|
RUN_NAME = tw-8node-resume
|
|||
|
|
OUTPUT_DIR = runs/dev/tw-8node-resume
|
|||
|
|
NNODES = 8
|
|||
|
|
NODES = batch-block1-1015 batch-block1-3297 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3908 batch-block1-3227
|
|||
|
|
NODE_RANK = 3
|
|||
|
|
GPUS_PER_NODE = 8
|
|||
|
|
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
NODES = batch-block1-1015 batch-block1-3297 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3908 batch-block1-3227
|
|||
|
|
NODE_RANK = 7
|
|||
|
|
GPUS_PER_NODE = 8
|
|||
|
|
NODES = batch-block1-1015 batch-block1-3297 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3908 batch-block1-3227
|
|||
|
|
NODE_RANK = 6
|
|||
|
|
GPUS_PER_NODE = 8
|
|||
|
|
MASTER_ADDR = batch-block1-1015
|
|||
|
|
MASTER_PORT = 25001
|
|||
|
|
GLOBAL_TRAIN_BATCH_SIZE =
|
|||
|
|
GRADIENT_ACCUMULATION_STEPS =
|
|||
|
|
PER_DEVICE_TRAIN_BATCH_SIZE =
|
|||
|
|
MASTER_ADDR = batch-block1-1015
|
|||
|
|
MASTER_PORT = 25001
|
|||
|
|
GLOBAL_TRAIN_BATCH_SIZE =
|
|||
|
|
GRADIENT_ACCUMULATION_STEPS =
|
|||
|
|
PER_DEVICE_TRAIN_BATCH_SIZE =
|
|||
|
|
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
MASTER_ADDR = batch-block1-1015
|
|||
|
|
MASTER_PORT = 25001
|
|||
|
|
GLOBAL_TRAIN_BATCH_SIZE =
|
|||
|
|
GRADIENT_ACCUMULATION_STEPS =
|
|||
|
|
PER_DEVICE_TRAIN_BATCH_SIZE =
|
|||
|
|
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
SLURM_JOB_ID = 9168633
|
|||
|
|
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
|||
|
|
RUN_NAME = tw-8node-resume
|
|||
|
|
OUTPUT_DIR = runs/dev/tw-8node-resume
|
|||
|
|
NNODES = 8
|
|||
|
|
NODES = batch-block1-1015 batch-block1-3297 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3908 batch-block1-3227
|
|||
|
|
NODE_RANK = 2
|
|||
|
|
GPUS_PER_NODE = 8
|
|||
|
|
SLURM_JOB_ID = 9168633
|
|||
|
|
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
|||
|
|
RUN_NAME = tw-8node-resume
|
|||
|
|
OUTPUT_DIR = runs/dev/tw-8node-resume
|
|||
|
|
NNODES = 8
|
|||
|
|
MASTER_ADDR = batch-block1-1015
|
|||
|
|
MASTER_PORT = 25001
|
|||
|
|
GLOBAL_TRAIN_BATCH_SIZE =
|
|||
|
|
GRADIENT_ACCUMULATION_STEPS =
|
|||
|
|
PER_DEVICE_TRAIN_BATCH_SIZE =
|
|||
|
|
NODES = batch-block1-1015 batch-block1-3297 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3908 batch-block1-3227
|
|||
|
|
NODE_RANK = 1
|
|||
|
|
GPUS_PER_NODE = 8
|
|||
|
|
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
SLURM_JOB_ID = 9168633
|
|||
|
|
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
|||
|
|
RUN_NAME = tw-8node-resume
|
|||
|
|
OUTPUT_DIR = runs/dev/tw-8node-resume
|
|||
|
|
NNODES = 8
|
|||
|
|
MASTER_ADDR = batch-block1-1015
|
|||
|
|
MASTER_PORT = 25001
|
|||
|
|
GLOBAL_TRAIN_BATCH_SIZE =
|
|||
|
|
GRADIENT_ACCUMULATION_STEPS =
|
|||
|
|
PER_DEVICE_TRAIN_BATCH_SIZE =
|
|||
|
|
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
NODES = batch-block1-1015 batch-block1-3297 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3908 batch-block1-3227
|
|||
|
|
NODE_RANK = 0
|
|||
|
|
GPUS_PER_NODE = 8
|
|||
|
|
MASTER_ADDR = batch-block1-1015
|
|||
|
|
MASTER_PORT = 25001
|
|||
|
|
GLOBAL_TRAIN_BATCH_SIZE =
|
|||
|
|
GRADIENT_ACCUMULATION_STEPS =
|
|||
|
|
PER_DEVICE_TRAIN_BATCH_SIZE =
|
|||
|
|
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
SLURM_JOB_ID = 9168633
|
|||
|
|
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
|||
|
|
RUN_NAME = tw-8node-resume
|
|||
|
|
OUTPUT_DIR = runs/dev/tw-8node-resume
|
|||
|
|
NNODES = 8
|
|||
|
|
NODES = batch-block1-1015 batch-block1-3297 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3908 batch-block1-3227
|
|||
|
|
NODE_RANK = 4
|
|||
|
|
GPUS_PER_NODE = 8
|
|||
|
|
MASTER_ADDR = batch-block1-1015
|
|||
|
|
MASTER_PORT = 25001
|
|||
|
|
GLOBAL_TRAIN_BATCH_SIZE =
|
|||
|
|
GRADIENT_ACCUMULATION_STEPS =
|
|||
|
|
PER_DEVICE_TRAIN_BATCH_SIZE =
|
|||
|
|
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
|
|||
|
|
Imported prefix tree collator v1Imported prefix tree collator v1
|
|||
|
|
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1Imported prefix tree collator v1
|
|||
|
|
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1Imported prefix tree collator v1
|
|||
|
|
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1Imported prefix tree collator v1
|
|||
|
|
|
|||
|
|
Imported prefix tree collator v1Imported prefix tree collator v1
|
|||
|
|
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1Imported prefix tree collator v1
|
|||
|
|
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1Imported prefix tree collator v1
|
|||
|
|
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
Imported prefix tree collator v1
|
|||
|
|
[2026-04-13 23:03:58,482] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:03:58,482] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:03:58,609] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:03:58,667] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:03:58,672] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:03:58,672] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:03:58,672] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:03:58,672] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:04:06,850] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:06,866] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:07,134] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:07,203] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:07,325] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:07,329] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:07,336] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:07,365] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:08,644] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:08,757] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:08,759] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:08,761] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:08,761] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:08,766] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:08,769] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:08,769] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:17,775] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:04:17,775] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:04:17,775] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:04:17,775] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:04:17,775] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:04:17,775] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:04:17,775] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:04:17,775] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:04:17,775] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:04:17,775] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:04:17,775] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:04:17,775] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:04:17,775] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:04:17,775] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:04:17,775] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:04:17,776] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:04:18,422] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:04:18,422] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:04:18,422] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:04:18,422] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:04:18,422] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:04:18,422] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:04:18,422] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:04:18,423] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:04:18,531] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:04:18,531] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:04:18,531] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:04:18,531] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:04:18,531] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:04:18,531] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:04:18,531] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:04:18,531] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:04:18,582] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:04:18,582] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:04:18,582] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:04:18,582] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:04:18,582] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:04:18,582] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:04:18,582] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:04:18,582] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:04:18,773] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:04:18,773] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:04:18,773] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:04:18,773] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:04:18,773] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:04:18,773] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:04:18,773] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:04:18,774] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:04:19,192] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:04:19,192] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:04:19,192] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:04:19,192] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:04:19,192] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:04:19,192] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:04:19,192] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:04:19,193] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
|||
|
|
[2026-04-13 23:04:29,337] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:29,337] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:29,337] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:29,337] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:29,337] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:29,337] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:29,337] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:29,337] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:29,376] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:29,376] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:29,376] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:29,376] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:29,376] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:29,376] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:29,376] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:29,376] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:30,218] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:30,218] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:30,218] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:30,218] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:30,218] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:30,218] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:30,218] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:30,218] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:30,223] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:30,223] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:30,223] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:30,223] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:30,223] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:30,224] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:30,224] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:30,224] [INFO] [comm.py:706:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
|
|||
|
|
[2026-04-13 23:04:30,224] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:30,348] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:30,348] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:30,348] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:30,348] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:30,349] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:30,349] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:30,349] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:30,349] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:30,351] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:30,351] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:30,351] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:30,351] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:30,351] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:30,351] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:30,351] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:30,351] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:31,121] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:31,122] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:31,122] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:31,172] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:31,172] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:31,172] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:31,172] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:31,172] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:31,210] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:31,224] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:31,224] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:31,224] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:31,224] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:31,224] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:31,224] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:31,224] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:31,224] [INFO] [comm.py:675:init_distributed] cdb=None
|
|||
|
|
[2026-04-13 23:04:31,227] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:31,246] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:31,247] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:31,358] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:31,358] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:31,358] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:31,359] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:31,893] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:31,893] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:31,893] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:31,893] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:31,893] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:31,894] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:31,894] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:31,895] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:36,779] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:36,822] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:36,822] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:36,850] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:36,862] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:36,874] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:36,896] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:36,900] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:36,906] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:36,908] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:36,911] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:36,914] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:36,923] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:36,932] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:36,932] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:36,953] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:36,955] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:36,961] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:36,969] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:36,969] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:36,969] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:36,974] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:36,976] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:36,991] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:37,042] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:37,043] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:37,075] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:37,097] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:37,103] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:37,104] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:37,117] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:37,195] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
|||
|
|
[2026-04-13 23:04:41,568] [INFO] [partition_parameters.py:348:__exit__] finished initializing model - num_params = 399, num_elems = 8.19B
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
|
|||
|
|
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
|
|||
|
|
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-07_batch-block1-3273
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-07_batch-block1-3273
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-07_batch-block1-3273
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-06_batch-block1-3273
|
|||
|
|
Using Prefix Tree collatorUsing Prefix Tree collator
|
|||
|
|
|
|||
|
|
args.report_to: ['wandb']args.report_to: ['wandb']
|
|||
|
|
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-06_batch-block1-3273args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-07_batch-block1-3273
|
|||
|
|
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-07_batch-block1-3273
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-07_batch-block1-3273
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
Using Prefix Tree collatorUsing Prefix Tree collator
|
|||
|
|
Using Prefix Tree collatorUsing Prefix Tree collator
|
|||
|
|
|
|||
|
|
|
|||
|
|
Using Prefix Tree collatorUsing Prefix Tree collatorUsing Prefix Tree collator
|
|||
|
|
|
|||
|
|
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-3908
|
|||
|
|
args.report_to: ['wandb']args.report_to: ['wandb']args.report_to: ['wandb']
|
|||
|
|
|
|||
|
|
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-3908args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-3908args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-3908
|
|||
|
|
|
|||
|
|
|
|||
|
|
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
Using Prefix Tree collatorUsing Prefix Tree collatorUsing Prefix Tree collator
|
|||
|
|
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-3908
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.report_to: ['wandb']args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-3908
|
|||
|
|
|
|||
|
|
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-3908args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-3908
|
|||
|
|
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1027
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1027
|
|||
|
|
args.report_to: ['wandb']args.report_to: ['wandb']
|
|||
|
|
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1027args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1027
|
|||
|
|
|
|||
|
|
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1027
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1027
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1027
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1027
|
|||
|
|
Using Prefix Tree collatorUsing Prefix Tree collatorUsing Prefix Tree collator
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
|
|||
|
|
|
|||
|
|
Using Prefix Tree collatorUsing Prefix Tree collatorUsing Prefix Tree collator
|
|||
|
|
|
|||
|
|
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
args.report_to: ['wandb']args.report_to: ['wandb']
|
|||
|
|
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-29_batch-block1-3227args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-29_batch-block1-3227
|
|||
|
|
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-29_batch-block1-3227
|
|||
|
|
args.report_to: ['wandb']args.report_to: ['wandb']args.report_to: ['wandb']
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-29_batch-block1-3227args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-29_batch-block1-3227args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-29_batch-block1-3227
|
|||
|
|
|
|||
|
|
|
|||
|
|
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-29_batch-block1-3227
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-29_batch-block1-3227
|
|||
|
|
Using Prefix Tree collatorUsing Prefix Tree collator
|
|||
|
|
|
|||
|
|
Using Prefix Tree collatorUsing Prefix Tree collator
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1015
|
|||
|
|
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1015
|
|||
|
|
args.report_to: ['wandb']args.report_to: ['wandb']
|
|||
|
|
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1015args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1015
|
|||
|
|
|
|||
|
|
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1015
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1015
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1015
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1015
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1134
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1134
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1134
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1134
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1134
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1134
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1134
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1134
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
Using Prefix Tree collatorUsing Prefix Tree collator
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
|
|||
|
|
Using Prefix Tree collatorUsing Prefix Tree collator
|
|||
|
|
|
|||
|
|
Using Prefix Tree collatorUsing Prefix Tree collator
|
|||
|
|
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-31_batch-block1-3848args.report_to: ['wandb']
|
|||
|
|
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-31_batch-block1-3848
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-31_batch-block1-3848
|
|||
|
|
args.report_to: ['wandb']args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-31_batch-block1-3848
|
|||
|
|
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-31_batch-block1-3848
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-31_batch-block1-3848
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-31_batch-block1-3848args.report_to: ['wandb']
|
|||
|
|
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-31_batch-block1-3848
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
Using Prefix Tree collatorUsing Prefix Tree collatorUsing Prefix Tree collatorUsing Prefix Tree collator
|
|||
|
|
|
|||
|
|
|
|||
|
|
|
|||
|
|
Using Prefix Tree collator
|
|||
|
|
Using Prefix Tree collatorUsing Prefix Tree collator
|
|||
|
|
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-29_batch-block1-3297
|
|||
|
|
args.report_to: ['wandb']args.report_to: ['wandb']
|
|||
|
|
|
|||
|
|
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-29_batch-block1-3297args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-29_batch-block1-3297args.report_to: ['wandb']
|
|||
|
|
|
|||
|
|
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-29_batch-block1-3297
|
|||
|
|
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-29_batch-block1-3297
|
|||
|
|
args.report_to: ['wandb']
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-29_batch-block1-3297
|
|||
|
|
args.report_to: ['wandb']args.report_to: ['wandb']
|
|||
|
|
|
|||
|
|
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-29_batch-block1-3297args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-29_batch-block1-3297
|
|||
|
|
|
|||
|
|
Parameter Offload: Total persistent parameters: 308224 in 145 params
|
|||
|
|
{'train_runtime': 3.8131, 'train_samples_per_second': 2022.522, 'train_steps_per_second': 16.784, 'train_loss': 0.0, 'epoch': 8.0}
|
|||
|
|
[1;34mwandb[0m:
|
|||
|
|
[1;34mwandb[0m: 🚀 View run [33mruns/dev/tw-8node-resume[0m at: [34mhttps://wandb.ai/ligeng-zhu/ThreadWeaver/runs/tw-8node-resume[0m
|