631 lines
38 KiB
Plaintext
631 lines
38 KiB
Plaintext
SLURM_JOB_ID = 9168624
|
||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||
RUN_NAME = tw-8node-resume
|
||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||
NNODES = 8
|
||
SLURM_JOB_ID = 9168624
|
||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||
RUN_NAME = tw-8node-resume
|
||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||
NNODES = 8
|
||
NODES = batch-block1-3736 batch-block1-3305 batch-block1-3544 batch-block1-1089 batch-block1-3163 batch-block1-3038 batch-block1-1062 batch-block1-3035
|
||
NODE_RANK = 4
|
||
GPUS_PER_NODE = 8
|
||
NODES = batch-block1-3736 batch-block1-3305 batch-block1-3544 batch-block1-1089 batch-block1-3163 batch-block1-3038 batch-block1-1062 batch-block1-3035
|
||
NODE_RANK = 5
|
||
GPUS_PER_NODE = 8
|
||
MASTER_ADDR = batch-block1-3736
|
||
MASTER_PORT = 25001
|
||
GLOBAL_TRAIN_BATCH_SIZE =
|
||
GRADIENT_ACCUMULATION_STEPS =
|
||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||
MASTER_ADDR = batch-block1-3736
|
||
MASTER_PORT = 25001
|
||
GLOBAL_TRAIN_BATCH_SIZE =
|
||
GRADIENT_ACCUMULATION_STEPS =
|
||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||
SLURM_JOB_ID = 9168624
|
||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||
RUN_NAME = tw-8node-resume
|
||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||
NNODES = 8
|
||
SLURM_JOB_ID = 9168624
|
||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||
RUN_NAME = tw-8node-resume
|
||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||
NNODES = 8
|
||
SLURM_JOB_ID = 9168624
|
||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||
RUN_NAME = tw-8node-resume
|
||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||
NNODES = 8
|
||
SLURM_JOB_ID = 9168624
|
||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||
RUN_NAME = tw-8node-resume
|
||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||
NNODES = 8
|
||
NODES = batch-block1-3736 batch-block1-3305 batch-block1-3544 batch-block1-1089 batch-block1-3163 batch-block1-3038 batch-block1-1062 batch-block1-3035
|
||
NODE_RANK = 7
|
||
GPUS_PER_NODE = 8
|
||
NODES = batch-block1-3736 batch-block1-3305 batch-block1-3544 batch-block1-1089 batch-block1-3163 batch-block1-3038 batch-block1-1062 batch-block1-3035
|
||
NODE_RANK = 3
|
||
GPUS_PER_NODE = 8
|
||
SLURM_JOB_ID = 9168624
|
||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||
RUN_NAME = tw-8node-resume
|
||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||
NNODES = 8
|
||
NODES = batch-block1-3736 batch-block1-3305 batch-block1-3544 batch-block1-1089 batch-block1-3163 batch-block1-3038 batch-block1-1062 batch-block1-3035
|
||
NODE_RANK = 1
|
||
GPUS_PER_NODE = 8
|
||
SLURM_JOB_ID = 9168624
|
||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||
RUN_NAME = tw-8node-resume
|
||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||
NNODES = 8
|
||
MASTER_ADDR = batch-block1-3736
|
||
MASTER_PORT = 25001
|
||
GLOBAL_TRAIN_BATCH_SIZE =
|
||
GRADIENT_ACCUMULATION_STEPS =
|
||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||
NODES = batch-block1-3736 batch-block1-3305 batch-block1-3544 batch-block1-1089 batch-block1-3163 batch-block1-3038 batch-block1-1062 batch-block1-3035
|
||
NODE_RANK = 2
|
||
GPUS_PER_NODE = 8
|
||
MASTER_ADDR = batch-block1-3736
|
||
MASTER_PORT = 25001
|
||
GLOBAL_TRAIN_BATCH_SIZE =
|
||
GRADIENT_ACCUMULATION_STEPS =
|
||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||
NODES = batch-block1-3736 batch-block1-3305 batch-block1-3544 batch-block1-1089 batch-block1-3163 batch-block1-3038 batch-block1-1062 batch-block1-3035
|
||
NODE_RANK = 0
|
||
GPUS_PER_NODE = 8
|
||
MASTER_ADDR = batch-block1-3736
|
||
MASTER_PORT = 25001
|
||
GLOBAL_TRAIN_BATCH_SIZE =
|
||
GRADIENT_ACCUMULATION_STEPS =
|
||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||
NODES = batch-block1-3736 batch-block1-3305 batch-block1-3544 batch-block1-1089 batch-block1-3163 batch-block1-3038 batch-block1-1062 batch-block1-3035
|
||
NODE_RANK = 6
|
||
GPUS_PER_NODE = 8
|
||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||
MASTER_ADDR = batch-block1-3736
|
||
MASTER_PORT = 25001
|
||
GLOBAL_TRAIN_BATCH_SIZE =
|
||
GRADIENT_ACCUMULATION_STEPS =
|
||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||
MASTER_ADDR = batch-block1-3736
|
||
MASTER_PORT = 25001
|
||
GLOBAL_TRAIN_BATCH_SIZE =
|
||
GRADIENT_ACCUMULATION_STEPS =
|
||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||
MASTER_ADDR = batch-block1-3736
|
||
MASTER_PORT = 25001
|
||
GLOBAL_TRAIN_BATCH_SIZE =
|
||
GRADIENT_ACCUMULATION_STEPS =
|
||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1Imported prefix tree collator v1Imported prefix tree collator v1
|
||
|
||
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1Imported prefix tree collator v1Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
|
||
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
Imported prefix tree collator v1
|
||
[2026-04-13 22:57:05,613] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:05,613] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:05,675] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:05,705] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:05,869] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:05,871] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:05,874] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:05,876] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:05,882] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:05,885] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:05,902] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:05,962] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:05,967] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:05,997] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:06,005] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:06,008] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:06,011] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:06,016] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:06,041] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:06,048] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:06,055] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:06,076] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:06,078] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:06,108] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:06,119] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:06,122] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:06,280] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:06,346] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:06,401] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:06,418] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:06,442] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:06,456] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:06,457] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:06,461] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:06,468] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:06,480] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:06,504] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:06,506] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:06,510] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:06,519] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:13,815] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:13,817] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:14,049] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:14,097] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:14,123] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:14,212] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:14,298] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:14,326] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:14,331] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:14,339] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:14,424] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:14,502] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:14,508] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:14,571] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:14,570] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:14,576] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:14,581] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:14,593] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:14,603] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:14,603] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:14,635] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:14,657] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:14,659] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:14,667] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:14,690] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:14,754] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:14,768] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:14,800] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:14,820] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:14,894] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:15,006] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:15,054] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:15,358] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:15,377] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:15,407] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:15,465] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:15,467] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:15,467] [INFO] [comm.py:706:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
|
||
[2026-04-13 22:57:15,484] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:15,485] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:15,501] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:15,805] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:15,845] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:15,845] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:15,854] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:15,906] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:15,910] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:15,950] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:15,955] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:15,979] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:15,979] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:15,981] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:15,981] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:15,986] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:15,989] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:15,992] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:16,003] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:16,004] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:16,009] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:16,023] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:16,044] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:16,050] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:16,054] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:22,654] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:22,662] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:22,662] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:22,677] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:22,680] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:22,683] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:22,684] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:22,684] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:22,690] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:22,694] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:22,710] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:22,750] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:22,751] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:22,753] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:22,757] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:22,769] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:22,775] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:25,930] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:25,930] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:25,930] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:25,930] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:25,930] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:25,930] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:25,930] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:25,930] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:26,065] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:26,065] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:26,065] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:26,065] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:26,065] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:26,065] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:26,065] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:26,065] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:26,534] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:26,534] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:26,534] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:26,534] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:26,534] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:26,534] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:26,534] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:26,535] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||
[2026-04-13 22:57:26,700] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:37,768] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:37,768] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:37,768] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:37,768] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:37,768] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:37,768] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:37,768] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:37,769] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:37,796] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:37,796] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:37,796] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:37,796] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:37,796] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:37,796] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:37,796] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:37,796] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:38,359] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:38,359] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:38,359] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:38,359] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:38,359] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:38,359] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:38,359] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:38,360] [INFO] [comm.py:675:init_distributed] cdb=None
|
||
[2026-04-13 22:57:40,194] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:40,194] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:40,194] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:40,194] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:40,194] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:40,215] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:40,215] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:40,215] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:40,215] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:40,215] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:40,215] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:40,216] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:40,363] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:40,363] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:40,363] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:40,363] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:40,364] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:47,001] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:47,049] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:47,061] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:47,091] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:47,098] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:47,098] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:47,105] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||
[2026-04-13 22:57:50,876] [INFO] [partition_parameters.py:348:__exit__] finished initializing model - num_params = 399, num_elems = 8.19B
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
|
||
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-1089
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-3035
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']
|
||
args.report_to: ['wandb']
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-1062
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-1089
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-13_batch-block1-3544
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-1062
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-15_batch-block1-3736
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']
|
||
Using Prefix Tree collator
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-3544
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-15_batch-block1-3736
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-13_batch-block1-3544
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-15_batch-block1-3736
|
||
args.report_to: ['wandb']
|
||
Using Prefix Tree collator
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-3035
|
||
args.report_to: ['wandb']
|
||
Using Prefix Tree collator
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-15_batch-block1-3736
|
||
args.report_to: ['wandb']
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-1062
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']
|
||
Using Prefix Tree collator
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-1089
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-1089
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-1089
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-1062
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-3544
|
||
args.report_to: ['wandb']
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-3544
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-1062
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-15_batch-block1-1062
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-15_batch-block1-3736
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-3035
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-1089
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']
|
||
args.report_to: ['wandb']
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-1062
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-3544
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']Using Prefix Tree collator
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-3035
|
||
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']
|
||
args.report_to: ['wandb']
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-3035
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-3035
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-1089
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-15_batch-block1-3736
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-1089
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-3544
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-3544
|
||
args.report_to: ['wandb']
|
||
Using Prefix Tree collatorUsing Prefix Tree collator
|
||
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-15_batch-block1-1062
|
||
args.report_to: ['wandb']
|
||
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-3035
|
||
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-3035
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-15_batch-block1-3736
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-15_batch-block1-3736
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collatorUsing Prefix Tree collator
|
||
|
||
Using Prefix Tree collatorUsing Prefix Tree collator
|
||
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-37_batch-block1-3163
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-37_batch-block1-3163
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-37_batch-block1-3163
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-37_batch-block1-3163args.report_to: ['wandb']
|
||
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-37_batch-block1-3163
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-37_batch-block1-3163
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-37_batch-block1-3163
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-37_batch-block1-3163
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collatorUsing Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-38_batch-block1-3305
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-38_batch-block1-3305
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-38_batch-block1-3305args.report_to: ['wandb']
|
||
args.report_to: ['wandb']
|
||
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-38_batch-block1-3305
|
||
|
||
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-38_batch-block1-3305
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-38_batch-block1-3305
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-38_batch-block1-3305
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-38_batch-block1-3305
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
Using Prefix Tree collatorUsing Prefix Tree collator
|
||
Using Prefix Tree collator
|
||
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-37_batch-block1-3038
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-37_batch-block1-3038
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-37_batch-block1-3038
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-37_batch-block1-3038
|
||
args.report_to: ['wandb']
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-37_batch-block1-3038
|
||
args.report_to: ['wandb']args.report_to: ['wandb']
|
||
|
||
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-37_batch-block1-3038
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-37_batch-block1-3038
|
||
|
||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-37_batch-block1-3038
|
||
Parameter Offload: Total persistent parameters: 308224 in 145 params
|
||
{'train_runtime': 4.1282, 'train_samples_per_second': 1868.144, 'train_steps_per_second': 15.503, 'train_loss': 0.0, 'epoch': 8.0}
|
||
[1;34mwandb[0m:
|
||
[1;34mwandb[0m: 🚀 View run [33mruns/dev/tw-8node-resume[0m at: [34mhttps://wandb.ai/ligeng-zhu/ThreadWeaver/runs/tw-8node-resume[0m
|