初始化项目,由ModelHub XC社区提供模型
Model: ligeng-dev/q3-8b-train_final_v2_nb2_mt8192_replaced_fix Source: Original Platform
This commit is contained in:
36
.gitattributes
vendored
Normal file
36
.gitattributes
vendored
Normal file
@@ -0,0 +1,36 @@
|
||||
*.7z filter=lfs diff=lfs merge=lfs -text
|
||||
*.arrow filter=lfs diff=lfs merge=lfs -text
|
||||
*.bin filter=lfs diff=lfs merge=lfs -text
|
||||
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
||||
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
||||
*.ftz filter=lfs diff=lfs merge=lfs -text
|
||||
*.gz filter=lfs diff=lfs merge=lfs -text
|
||||
*.h5 filter=lfs diff=lfs merge=lfs -text
|
||||
*.joblib filter=lfs diff=lfs merge=lfs -text
|
||||
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
||||
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
||||
*.model filter=lfs diff=lfs merge=lfs -text
|
||||
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
||||
*.npy filter=lfs diff=lfs merge=lfs -text
|
||||
*.npz filter=lfs diff=lfs merge=lfs -text
|
||||
*.onnx filter=lfs diff=lfs merge=lfs -text
|
||||
*.ot filter=lfs diff=lfs merge=lfs -text
|
||||
*.parquet filter=lfs diff=lfs merge=lfs -text
|
||||
*.pb filter=lfs diff=lfs merge=lfs -text
|
||||
*.pickle filter=lfs diff=lfs merge=lfs -text
|
||||
*.pkl filter=lfs diff=lfs merge=lfs -text
|
||||
*.pt filter=lfs diff=lfs merge=lfs -text
|
||||
*.pth filter=lfs diff=lfs merge=lfs -text
|
||||
*.rar filter=lfs diff=lfs merge=lfs -text
|
||||
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
||||
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
||||
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
||||
*.tar filter=lfs diff=lfs merge=lfs -text
|
||||
*.tflite filter=lfs diff=lfs merge=lfs -text
|
||||
*.tgz filter=lfs diff=lfs merge=lfs -text
|
||||
*.wasm filter=lfs diff=lfs merge=lfs -text
|
||||
*.xz filter=lfs diff=lfs merge=lfs -text
|
||||
*.zip filter=lfs diff=lfs merge=lfs -text
|
||||
*.zst filter=lfs diff=lfs merge=lfs -text
|
||||
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
||||
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
||||
58
README.md
Normal file
58
README.md
Normal file
@@ -0,0 +1,58 @@
|
||||
---
|
||||
base_model: Qwen/Qwen3-8B
|
||||
library_name: transformers
|
||||
model_name: tw-8node-resume
|
||||
tags:
|
||||
- generated_from_trainer
|
||||
- trl
|
||||
- sft
|
||||
licence: license
|
||||
---
|
||||
|
||||
# Model Card for tw-8node-resume
|
||||
|
||||
This model is a fine-tuned version of [Qwen/Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B).
|
||||
It has been trained using [TRL](https://github.com/huggingface/trl).
|
||||
|
||||
## Quick start
|
||||
|
||||
```python
|
||||
from transformers import pipeline
|
||||
|
||||
question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
|
||||
generator = pipeline("text-generation", model="ligeng-dev/q3-8b-train_final_v2_nb2_mt8192_replaced_fix", device="cuda")
|
||||
output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
|
||||
print(output["generated_text"])
|
||||
```
|
||||
|
||||
## Training procedure
|
||||
|
||||
[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/ligeng-zhu/ThreadWeaver/runs/tw-8node-resume)
|
||||
|
||||
|
||||
This model was trained with SFT.
|
||||
|
||||
### Framework versions
|
||||
|
||||
- TRL: 0.19.0
|
||||
- Transformers: 4.51.1
|
||||
- PyTorch: 2.6.0
|
||||
- Datasets: 3.6.0
|
||||
- Tokenizers: 0.21.1
|
||||
|
||||
## Citations
|
||||
|
||||
|
||||
|
||||
Cite TRL as:
|
||||
|
||||
```bibtex
|
||||
@misc{vonwerra2022trl,
|
||||
title = {{TRL: Transformer Reinforcement Learning}},
|
||||
author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec},
|
||||
year = 2020,
|
||||
journal = {GitHub repository},
|
||||
publisher = {GitHub},
|
||||
howpublished = {\url{https://github.com/huggingface/trl}}
|
||||
}
|
||||
```
|
||||
44
added_tokens.json
Normal file
44
added_tokens.json
Normal file
@@ -0,0 +1,44 @@
|
||||
{
|
||||
"</Conclusion>": 151684,
|
||||
"</Outline>": 151676,
|
||||
"</Outlines>": 151674,
|
||||
"</Parallel>": 151672,
|
||||
"</Subtask>": 151680,
|
||||
"</Think>": 151670,
|
||||
"</Thread>": 151682,
|
||||
"</Trial>": 151678,
|
||||
"</think>": 151668,
|
||||
"</tool_call>": 151658,
|
||||
"</tool_response>": 151666,
|
||||
"<Conclusion>": 151683,
|
||||
"<Outline>": 151675,
|
||||
"<Outlines>": 151673,
|
||||
"<Parallel>": 151671,
|
||||
"<Subtask>": 151679,
|
||||
"<Think>": 151669,
|
||||
"<Thread>": 151681,
|
||||
"<Trial>": 151677,
|
||||
"<think>": 151667,
|
||||
"<tool_call>": 151657,
|
||||
"<tool_response>": 151665,
|
||||
"<|box_end|>": 151649,
|
||||
"<|box_start|>": 151648,
|
||||
"<|endoftext|>": 151643,
|
||||
"<|file_sep|>": 151664,
|
||||
"<|fim_middle|>": 151660,
|
||||
"<|fim_pad|>": 151662,
|
||||
"<|fim_prefix|>": 151659,
|
||||
"<|fim_suffix|>": 151661,
|
||||
"<|im_end|>": 151645,
|
||||
"<|im_start|>": 151644,
|
||||
"<|image_pad|>": 151655,
|
||||
"<|object_ref_end|>": 151647,
|
||||
"<|object_ref_start|>": 151646,
|
||||
"<|quad_end|>": 151651,
|
||||
"<|quad_start|>": 151650,
|
||||
"<|repo_name|>": 151663,
|
||||
"<|video_pad|>": 151656,
|
||||
"<|vision_end|>": 151653,
|
||||
"<|vision_pad|>": 151654,
|
||||
"<|vision_start|>": 151652
|
||||
}
|
||||
30
config.json
Normal file
30
config.json
Normal file
@@ -0,0 +1,30 @@
|
||||
{
|
||||
"architectures": [
|
||||
"Qwen3ForCausalLM"
|
||||
],
|
||||
"attention_bias": false,
|
||||
"attention_dropout": 0.0,
|
||||
"bos_token_id": 151643,
|
||||
"eos_token_id": 151645,
|
||||
"head_dim": 128,
|
||||
"hidden_act": "silu",
|
||||
"hidden_size": 4096,
|
||||
"initializer_range": 0.02,
|
||||
"intermediate_size": 12288,
|
||||
"max_position_embeddings": 40960,
|
||||
"max_window_layers": 36,
|
||||
"model_type": "qwen3",
|
||||
"num_attention_heads": 32,
|
||||
"num_hidden_layers": 36,
|
||||
"num_key_value_heads": 8,
|
||||
"rms_norm_eps": 1e-06,
|
||||
"rope_scaling": null,
|
||||
"rope_theta": 1000000,
|
||||
"sliding_window": null,
|
||||
"tie_word_embeddings": false,
|
||||
"torch_dtype": "bfloat16",
|
||||
"transformers_version": "4.51.1",
|
||||
"use_cache": true,
|
||||
"use_sliding_window": false,
|
||||
"vocab_size": 151744
|
||||
}
|
||||
13
generation_config.json
Normal file
13
generation_config.json
Normal file
@@ -0,0 +1,13 @@
|
||||
{
|
||||
"bos_token_id": 151643,
|
||||
"do_sample": true,
|
||||
"eos_token_id": [
|
||||
151645,
|
||||
151643
|
||||
],
|
||||
"pad_token_id": 151643,
|
||||
"temperature": 0.6,
|
||||
"top_k": 20,
|
||||
"top_p": 0.95,
|
||||
"transformers_version": "4.51.1"
|
||||
}
|
||||
151388
merges.txt
Normal file
151388
merges.txt
Normal file
File diff suppressed because it is too large
Load Diff
3
model-00001-of-00004.safetensors
Normal file
3
model-00001-of-00004.safetensors
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:c7518f9abec80fc05f085a7ca1624f27603d691322eeba2030c1a894912e54a3
|
||||
size 4900684832
|
||||
3
model-00002-of-00004.safetensors
Normal file
3
model-00002-of-00004.safetensors
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:32cb63f1f06c832ec8a5df1f6c6c6a1ecc1e285bbed2690e39d1d9debb78aa39
|
||||
size 4915960368
|
||||
3
model-00003-of-00004.safetensors
Normal file
3
model-00003-of-00004.safetensors
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:ac906db1972bd385dfab4fe08ec3037fcc758ad6046c5196122405b0f6e0fd92
|
||||
size 4983068496
|
||||
3
model-00004-of-00004.safetensors
Normal file
3
model-00004-of-00004.safetensors
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:4fc7d83d34247e2ecbb46c070115e6e4356a21365fbe400356a5dbec1b816432
|
||||
size 1578657400
|
||||
406
model.safetensors.index.json
Normal file
406
model.safetensors.index.json
Normal file
@@ -0,0 +1,406 @@
|
||||
{
|
||||
"metadata": {
|
||||
"total_size": 16378324992
|
||||
},
|
||||
"weight_map": {
|
||||
"lm_head.weight": "model-00004-of-00004.safetensors",
|
||||
"model.embed_tokens.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.0.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.0.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.1.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.1.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.10.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.10.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.11.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.11.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.12.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.12.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.13.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.13.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.14.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.14.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.15.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.15.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.16.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.16.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.17.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.17.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.18.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.18.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.19.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.19.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.2.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.2.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.20.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.20.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.20.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.20.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.20.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.20.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.21.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.21.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.21.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.21.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.21.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.21.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.21.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.21.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.21.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.21.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.21.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.22.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.22.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.22.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.22.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.22.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.22.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.23.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.23.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.24.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.24.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.25.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.25.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.26.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.26.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.27.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.27.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.28.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.28.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.29.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.29.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.3.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.3.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.30.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.30.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.31.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.31.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.31.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.31.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.31.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.32.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.32.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.32.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.32.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.32.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.32.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.32.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.32.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.32.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.32.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.32.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.33.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.33.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.33.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.33.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.33.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.33.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.33.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.33.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.33.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.33.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.33.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.34.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.34.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.34.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.34.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.34.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.34.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.34.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.34.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.34.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.34.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.34.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.35.input_layernorm.weight": "model-00004-of-00004.safetensors",
|
||||
"model.layers.35.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
|
||||
"model.layers.35.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
|
||||
"model.layers.35.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
|
||||
"model.layers.35.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
|
||||
"model.layers.35.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
|
||||
"model.layers.35.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.35.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
|
||||
"model.layers.35.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
|
||||
"model.layers.35.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.35.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
||||
"model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.4.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.4.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.5.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.5.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.6.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.6.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.7.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.7.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.8.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.8.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.9.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||
"model.layers.9.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.9.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.9.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.9.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.9.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.layers.9.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
||||
"model.norm.weight": "model-00004-of-00004.safetensors"
|
||||
}
|
||||
}
|
||||
594
slurm/9166912.0.err
Normal file
594
slurm/9166912.0.err
Normal file
File diff suppressed because one or more lines are too long
41654
slurm/9166912.0.out
Normal file
41654
slurm/9166912.0.out
Normal file
File diff suppressed because it is too large
Load Diff
597
slurm/9168614.0.err
Normal file
597
slurm/9168614.0.err
Normal file
File diff suppressed because one or more lines are too long
630
slurm/9168614.0.out
Normal file
630
slurm/9168614.0.out
Normal file
@@ -0,0 +1,630 @@
|
||||
SLURM_JOB_ID = 9168614
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
NODES = batch-block1-3273 batch-block1-3339 batch-block1-3736 batch-block1-3544 batch-block1-1089 batch-block1-3578 batch-block1-1062 batch-block1-3035
|
||||
NODE_RANK = 3
|
||||
GPUS_PER_NODE = 8
|
||||
MASTER_ADDR = batch-block1-3273
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
SLURM_JOB_ID = 9168614
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
SLURM_JOB_ID = 9168614
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
SLURM_JOB_ID = 9168614
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
SLURM_JOB_ID = 9168614
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
SLURM_JOB_ID = 9168614
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
SLURM_JOB_ID = 9168614
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
SLURM_JOB_ID = 9168614
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
NODES = batch-block1-3273 batch-block1-3339 batch-block1-3736 batch-block1-3544 batch-block1-1089 batch-block1-3578 batch-block1-1062 batch-block1-3035
|
||||
NODE_RANK = 7
|
||||
GPUS_PER_NODE = 8
|
||||
NODES = batch-block1-3273 batch-block1-3339 batch-block1-3736 batch-block1-3544 batch-block1-1089 batch-block1-3578 batch-block1-1062 batch-block1-3035
|
||||
NODE_RANK = 6
|
||||
GPUS_PER_NODE = 8
|
||||
NODES = batch-block1-3273 batch-block1-3339 batch-block1-3736 batch-block1-3544 batch-block1-1089 batch-block1-3578 batch-block1-1062 batch-block1-3035
|
||||
NODE_RANK = 0
|
||||
GPUS_PER_NODE = 8
|
||||
NODES = batch-block1-3273 batch-block1-3339 batch-block1-3736 batch-block1-3544 batch-block1-1089 batch-block1-3578 batch-block1-1062 batch-block1-3035
|
||||
NODE_RANK = 1
|
||||
GPUS_PER_NODE = 8
|
||||
NODES = batch-block1-3273 batch-block1-3339 batch-block1-3736 batch-block1-3544 batch-block1-1089 batch-block1-3578 batch-block1-1062 batch-block1-3035
|
||||
NODE_RANK = 4
|
||||
GPUS_PER_NODE = 8
|
||||
NODES = batch-block1-3273 batch-block1-3339 batch-block1-3736 batch-block1-3544 batch-block1-1089 batch-block1-3578 batch-block1-1062 batch-block1-3035
|
||||
NODE_RANK = 5
|
||||
GPUS_PER_NODE = 8
|
||||
MASTER_ADDR = batch-block1-3273
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
MASTER_ADDR = batch-block1-3273
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
NODES = batch-block1-3273 batch-block1-3339 batch-block1-3736 batch-block1-3544 batch-block1-1089 batch-block1-3578 batch-block1-1062 batch-block1-3035
|
||||
NODE_RANK = 2
|
||||
GPUS_PER_NODE = 8
|
||||
MASTER_ADDR = batch-block1-3273
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
MASTER_ADDR = batch-block1-3273
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
MASTER_ADDR = batch-block1-3273
|
||||
MASTER_ADDR = batch-block1-3273
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
MASTER_ADDR = batch-block1-3273
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1Imported prefix tree collator v1
|
||||
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1Imported prefix tree collator v1
|
||||
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1Imported prefix tree collator v1
|
||||
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1Imported prefix tree collator v1
|
||||
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1Imported prefix tree collator v1
|
||||
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
|
||||
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1Imported prefix tree collator v1
|
||||
|
||||
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
[2026-04-13 22:52:44,641] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:44,641] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:44,641] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:44,641] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:44,641] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:44,641] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:44,641] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:44,641] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:44,823] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:44,823] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:44,823] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:44,823] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:44,823] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:44,823] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:44,823] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:44,824] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:44,830] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:44,830] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:44,830] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:44,830] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:44,830] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:44,830] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:44,830] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:44,831] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:45,044] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:45,044] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:45,044] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:45,044] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:45,044] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:45,044] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:45,044] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:45,044] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:45,505] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:45,505] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:45,505] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:45,505] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:45,505] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:45,505] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:45,505] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:45,505] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:45,710] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:45,710] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:45,710] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:45,710] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:45,710] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:45,710] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:45,710] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:45,711] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:46,098] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:46,098] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:46,098] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:46,098] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:46,098] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:46,098] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:46,098] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:46,099] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:46,114] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:46,114] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:46,114] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:46,114] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:46,114] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:46,114] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:46,114] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:46,115] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:52:57,932] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:52:57,932] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:52:57,932] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:52:57,932] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:52:57,932] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:52:57,932] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:52:57,932] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:52:57,932] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:52:58,617] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:52:58,617] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:52:58,617] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:52:58,617] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:52:58,617] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:52:58,617] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:52:58,618] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:52:58,618] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:52:58,976] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:52:58,976] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:52:58,976] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:52:58,976] [INFO] [comm.py:706:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
|
||||
[2026-04-13 22:52:58,976] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:52:58,977] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:52:58,977] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:52:58,976] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:52:58,977] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:52:58,977] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:52:58,977] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:52:58,977] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:52:58,977] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:52:58,977] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:52:58,978] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:52:58,978] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:52:58,978] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:52:59,398] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:52:59,398] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:52:59,398] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:52:59,399] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:52:59,399] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:52:59,399] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:52:59,399] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:52:59,399] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:52:59,650] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:52:59,650] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:52:59,650] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:52:59,650] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:52:59,650] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:52:59,650] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:52:59,650] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:52:59,650] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:52:59,893] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:52:59,894] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:52:59,924] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:52:59,924] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:52:59,924] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:52:59,988] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:52:59,988] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:52:59,988] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:52:59,988] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:52:59,988] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:52:59,988] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:52:59,988] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:52:59,988] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:52:59,988] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:52:59,988] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:52:59,988] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:52:59,989] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:53:00,251] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:53:00,305] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:53:00,315] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:53:00,318] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:53:00,318] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:53:00,320] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:53:00,320] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:53:00,320] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:53:00,321] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:53:00,332] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:53:00,333] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:53:00,333] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:53:00,337] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:53:00,338] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:53:00,338] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:53:00,342] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:53:00,383] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:53:00,383] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:53:00,383] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:53:00,383] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:53:00,383] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:53:00,383] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:53:00,383] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:53:00,384] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:53:00,471] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:53:00,471] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:53:00,471] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:53:00,471] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:53:00,472] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:53:00,472] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:53:00,472] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:53:00,472] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:53:02,594] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:53:02,604] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:53:02,604] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:53:02,627] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:53:02,628] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:53:02,634] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:53:02,634] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:53:02,645] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:53:02,672] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:53:02,672] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:53:02,672] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:53:02,672] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:53:02,675] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:53:02,680] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:53:02,692] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:53:02,693] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:53:05,737] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:53:05,750] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:53:05,779] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:53:05,813] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:53:05,854] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:53:05,858] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:53:05,885] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:53:05,899] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:53:05,899] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:53:05,917] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:53:05,932] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:53:05,949] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:53:05,959] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:53:05,959] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:53:05,961] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:53:10,377] [INFO] [partition_parameters.py:348:__exit__] finished initializing model - num_params = 399, num_elems = 8.19B
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3578Using Prefix Tree collator
|
||||
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3578
|
||||
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3578
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3578
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3578
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3578
|
||||
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3578
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3578
|
||||
Using Prefix Tree collatorUsing Prefix Tree collatorUsing Prefix Tree collator
|
||||
|
||||
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']args.report_to: ['wandb']
|
||||
|
||||
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-1062args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-1062
|
||||
|
||||
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-1062
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-1062
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-1062
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-1062
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-1062
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-1062
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collatorUsing Prefix Tree collator
|
||||
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collatorUsing Prefix Tree collator
|
||||
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-3736
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-3736
|
||||
args.report_to: ['wandb']args.report_to: ['wandb']
|
||||
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-3736args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-3736
|
||||
|
||||
args.report_to: ['wandb']args.report_to: ['wandb']
|
||||
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-3736args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-3736
|
||||
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-3736
|
||||
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-3736
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collatorUsing Prefix Tree collator
|
||||
|
||||
Using Prefix Tree collatorUsing Prefix Tree collator
|
||||
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-1089
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-1089
|
||||
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-1089
|
||||
args.report_to: ['wandb']args.report_to: ['wandb']
|
||||
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-1089args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-1089
|
||||
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-1089
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-1089
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-1089
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collatorUsing Prefix Tree collator
|
||||
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3273
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3273
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3273
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3273
|
||||
Using Prefix Tree collatorargs.report_to: ['wandb']
|
||||
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3273
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3273
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3273
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3273
|
||||
Using Prefix Tree collatorUsing Prefix Tree collator
|
||||
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']args.report_to: ['wandb']args.report_to: ['wandb']
|
||||
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3339args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3339
|
||||
|
||||
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3339
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3339
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3339
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3339
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3339args.report_to: ['wandb']
|
||||
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3339
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collatorUsing Prefix Tree collator
|
||||
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3035
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3035
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3035
|
||||
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3035
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3035
|
||||
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3035
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3035
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3035
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collatorUsing Prefix Tree collatorUsing Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
|
||||
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-57_batch-block1-3544
|
||||
args.report_to: ['wandb']args.report_to: ['wandb']
|
||||
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-57_batch-block1-3544args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-57_batch-block1-3544
|
||||
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-57_batch-block1-3544
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-57_batch-block1-3544
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-57_batch-block1-3544
|
||||
args.report_to: ['wandb']args.report_to: ['wandb']
|
||||
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-57_batch-block1-3544args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-57_batch-block1-3544
|
||||
|
||||
Parameter Offload: Total persistent parameters: 308224 in 145 params
|
||||
{'train_runtime': 4.1111, 'train_samples_per_second': 1875.899, 'train_steps_per_second': 15.568, 'train_loss': 0.0, 'epoch': 8.0}
|
||||
[1;34mwandb[0m:
|
||||
[1;34mwandb[0m: 🚀 View run [33mruns/dev/tw-8node-resume[0m at: [34mhttps://wandb.ai/ligeng-zhu/ThreadWeaver/runs/tw-8node-resume[0m
|
||||
595
slurm/9168619.0.err
Normal file
595
slurm/9168619.0.err
Normal file
File diff suppressed because one or more lines are too long
630
slurm/9168619.0.out
Normal file
630
slurm/9168619.0.out
Normal file
@@ -0,0 +1,630 @@
|
||||
SLURM_JOB_ID = 9168619
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
SLURM_JOB_ID = 9168619
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
SLURM_JOB_ID = 9168619
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
SLURM_JOB_ID = 9168619
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
SLURM_JOB_ID = 9168619
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
SLURM_JOB_ID = 9168619
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
SLURM_JOB_ID = 9168619
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
NODES = batch-block1-3273 batch-block1-3339 batch-block1-3736 batch-block1-3544 batch-block1-1089 batch-block1-3578 batch-block1-1062 batch-block1-3035
|
||||
NODE_RANK = 3
|
||||
GPUS_PER_NODE = 8
|
||||
SLURM_JOB_ID = 9168619
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
NODES = batch-block1-3273 batch-block1-3339 batch-block1-3736 batch-block1-3544 batch-block1-1089 batch-block1-3578 batch-block1-1062 batch-block1-3035
|
||||
NODE_RANK = 7
|
||||
GPUS_PER_NODE = 8
|
||||
NODES = batch-block1-3273 batch-block1-3339 batch-block1-3736 batch-block1-3544 batch-block1-1089 batch-block1-3578 batch-block1-1062 batch-block1-3035
|
||||
NODE_RANK = 0
|
||||
GPUS_PER_NODE = 8
|
||||
NODES = batch-block1-3273 batch-block1-3339 batch-block1-3736 batch-block1-3544 batch-block1-1089 batch-block1-3578 batch-block1-1062 batch-block1-3035
|
||||
NODE_RANK = 6
|
||||
GPUS_PER_NODE = 8
|
||||
NODES = batch-block1-3273 batch-block1-3339 batch-block1-3736 batch-block1-3544 batch-block1-1089 batch-block1-3578 batch-block1-1062 batch-block1-3035
|
||||
NODE_RANK = 4
|
||||
GPUS_PER_NODE = 8
|
||||
NODES = batch-block1-3273 batch-block1-3339 batch-block1-3736 batch-block1-3544 batch-block1-1089 batch-block1-3578 batch-block1-1062 batch-block1-3035
|
||||
NODE_RANK = 5
|
||||
GPUS_PER_NODE = 8
|
||||
NODES = batch-block1-3273 batch-block1-3339 batch-block1-3736 batch-block1-3544 batch-block1-1089 batch-block1-3578 batch-block1-1062 batch-block1-3035
|
||||
NODE_RANK = 1
|
||||
GPUS_PER_NODE = 8
|
||||
MASTER_ADDR = batch-block1-3273
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
NODES = batch-block1-3273 batch-block1-3339 batch-block1-3736 batch-block1-3544 batch-block1-1089 batch-block1-3578 batch-block1-1062 batch-block1-3035
|
||||
NODE_RANK = 2
|
||||
GPUS_PER_NODE = 8
|
||||
MASTER_ADDR = batch-block1-3273
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
MASTER_ADDR = batch-block1-3273
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
MASTER_ADDR = batch-block1-3273
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
MASTER_ADDR = batch-block1-3273
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
MASTER_ADDR = batch-block1-3273
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
MASTER_ADDR = batch-block1-3273
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
MASTER_ADDR = batch-block1-3273
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
[2026-04-13 22:55:05,743] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:05,789] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:05,789] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:05,789] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:05,790] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:05,790] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:05,791] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:05,810] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:05,826] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:05,827] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:05,834] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:05,834] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:05,856] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:05,856] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:05,858] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:05,860] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:05,867] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:05,875] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:05,882] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:05,882] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:05,883] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:05,887] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:05,888] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:05,890] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:05,892] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:05,894] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:05,896] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:05,898] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:05,899] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:05,903] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:05,906] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:05,910] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:06,046] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:06,077] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:06,102] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:06,128] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:06,130] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:06,131] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:06,192] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:06,192] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:06,192] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:06,199] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:06,212] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:06,224] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:06,225] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:06,228] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:06,229] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:06,237] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:06,259] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:06,271] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:06,290] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:06,376] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:06,379] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:06,383] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:06,385] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:06,385] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:06,641] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:06,666] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:06,714] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:06,736] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:06,769] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:06,786] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:06,791] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:06,799] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:55:14,032] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:14,042] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:14,434] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:14,435] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:14,438] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:14,446] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:14,532] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:14,533] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:14,538] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:14,538] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:14,541] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:14,600] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:14,603] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:14,615] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:14,621] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:14,648] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:14,665] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:14,682] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:14,696] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:14,718] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:14,760] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:14,821] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:14,830] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:14,867] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:14,875] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:14,879] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:14,883] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:14,888] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:14,894] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:14,910] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:14,924] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:14,924] [INFO] [comm.py:706:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
|
||||
[2026-04-13 22:55:14,943] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:14,964] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:14,989] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:15,005] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:15,049] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:15,050] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:15,101] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:15,107] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:15,145] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:15,188] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:15,201] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:15,203] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:15,217] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:15,249] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:15,271] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:15,295] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:15,315] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:15,354] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:15,383] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:15,416] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:15,419] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:15,441] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:15,450] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:15,471] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:15,491] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:15,499] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:15,518] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:15,524] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:15,546] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:15,577] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:15,579] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:15,585] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:15,588] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:55:17,410] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:17,419] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:17,424] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:17,440] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:17,454] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:17,459] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:17,491] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:17,494] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:17,498] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:17,542] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:17,555] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:17,752] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:17,754] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:17,754] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:17,757] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:17,762] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:17,762] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:21,637] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:21,748] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:21,866] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:21,867] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:21,870] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:21,876] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:21,885] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:21,900] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:21,904] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:21,907] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:21,908] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:21,908] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:21,912] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:21,921] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:22,076] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:22,108] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:22,295] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:26,257] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:26,288] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:26,302] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:26,302] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:26,325] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:26,337] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:26,341] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:26,341] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:26,346] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:26,358] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:26,370] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:26,371] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:26,374] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:26,376] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:26,389] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:26,391] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:26,392] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:26,395] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:26,395] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:26,405] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:26,409] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:26,420] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:26,454] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:26,457] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:26,468] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:26,468] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:26,485] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:26,486] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:27,300] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:27,321] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:55:31,232] [INFO] [partition_parameters.py:348:__exit__] finished initializing model - num_params = 399, num_elems = 8.19B
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-1062
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3578
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-1062
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-1089
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3339
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-1062
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3035
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-3736
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-3544
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-1062
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-3736
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-1089
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-3544
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-3544
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-3544
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-1089
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3035
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-3544
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-1089
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-1062
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3035
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3339
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3273
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3273
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3578
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3273
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3339
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-1089
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3339
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3035
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-1062
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-3736
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3578
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-1062
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-3736
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3035
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-1089
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3578
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-1062
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3339
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3578
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3035
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-3736
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3339
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3273
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-1089
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-1089
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3544args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-3544
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3578
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-3736
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-3544
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-3578
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-3736
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3339
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3273
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3578
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3736
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3273
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3273
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3339
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3273
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3035
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3035
|
||||
Parameter Offload: Total persistent parameters: 308224 in 145 params
|
||||
{'train_runtime': 3.3092, 'train_samples_per_second': 2330.489, 'train_steps_per_second': 19.34, 'train_loss': 0.0, 'epoch': 8.0}
|
||||
[1;34mwandb[0m:
|
||||
[1;34mwandb[0m: 🚀 View run [33mruns/dev/tw-8node-resume[0m at: [34mhttps://wandb.ai/ligeng-zhu/ThreadWeaver/runs/tw-8node-resume[0m
|
||||
594
slurm/9168624.0.err
Normal file
594
slurm/9168624.0.err
Normal file
File diff suppressed because one or more lines are too long
630
slurm/9168624.0.out
Normal file
630
slurm/9168624.0.out
Normal file
@@ -0,0 +1,630 @@
|
||||
SLURM_JOB_ID = 9168624
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
SLURM_JOB_ID = 9168624
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
NODES = batch-block1-3736 batch-block1-3305 batch-block1-3544 batch-block1-1089 batch-block1-3163 batch-block1-3038 batch-block1-1062 batch-block1-3035
|
||||
NODE_RANK = 4
|
||||
GPUS_PER_NODE = 8
|
||||
NODES = batch-block1-3736 batch-block1-3305 batch-block1-3544 batch-block1-1089 batch-block1-3163 batch-block1-3038 batch-block1-1062 batch-block1-3035
|
||||
NODE_RANK = 5
|
||||
GPUS_PER_NODE = 8
|
||||
MASTER_ADDR = batch-block1-3736
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
MASTER_ADDR = batch-block1-3736
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
SLURM_JOB_ID = 9168624
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
SLURM_JOB_ID = 9168624
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
SLURM_JOB_ID = 9168624
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
SLURM_JOB_ID = 9168624
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
NODES = batch-block1-3736 batch-block1-3305 batch-block1-3544 batch-block1-1089 batch-block1-3163 batch-block1-3038 batch-block1-1062 batch-block1-3035
|
||||
NODE_RANK = 7
|
||||
GPUS_PER_NODE = 8
|
||||
NODES = batch-block1-3736 batch-block1-3305 batch-block1-3544 batch-block1-1089 batch-block1-3163 batch-block1-3038 batch-block1-1062 batch-block1-3035
|
||||
NODE_RANK = 3
|
||||
GPUS_PER_NODE = 8
|
||||
SLURM_JOB_ID = 9168624
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
NODES = batch-block1-3736 batch-block1-3305 batch-block1-3544 batch-block1-1089 batch-block1-3163 batch-block1-3038 batch-block1-1062 batch-block1-3035
|
||||
NODE_RANK = 1
|
||||
GPUS_PER_NODE = 8
|
||||
SLURM_JOB_ID = 9168624
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
MASTER_ADDR = batch-block1-3736
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
NODES = batch-block1-3736 batch-block1-3305 batch-block1-3544 batch-block1-1089 batch-block1-3163 batch-block1-3038 batch-block1-1062 batch-block1-3035
|
||||
NODE_RANK = 2
|
||||
GPUS_PER_NODE = 8
|
||||
MASTER_ADDR = batch-block1-3736
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
NODES = batch-block1-3736 batch-block1-3305 batch-block1-3544 batch-block1-1089 batch-block1-3163 batch-block1-3038 batch-block1-1062 batch-block1-3035
|
||||
NODE_RANK = 0
|
||||
GPUS_PER_NODE = 8
|
||||
MASTER_ADDR = batch-block1-3736
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
NODES = batch-block1-3736 batch-block1-3305 batch-block1-3544 batch-block1-1089 batch-block1-3163 batch-block1-3038 batch-block1-1062 batch-block1-3035
|
||||
NODE_RANK = 6
|
||||
GPUS_PER_NODE = 8
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
MASTER_ADDR = batch-block1-3736
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
MASTER_ADDR = batch-block1-3736
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
MASTER_ADDR = batch-block1-3736
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1Imported prefix tree collator v1Imported prefix tree collator v1
|
||||
|
||||
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1Imported prefix tree collator v1Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
|
||||
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
[2026-04-13 22:57:05,613] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:05,613] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:05,675] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:05,705] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:05,869] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:05,871] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:05,874] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:05,876] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:05,882] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:05,885] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:05,902] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:05,962] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:05,967] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:05,997] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:06,005] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:06,008] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:06,011] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:06,016] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:06,041] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:06,048] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:06,055] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:06,076] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:06,078] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:06,108] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:06,119] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:06,122] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:06,280] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:06,346] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:06,401] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:06,418] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:06,442] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:06,456] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:06,457] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:06,461] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:06,468] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:06,480] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:06,504] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:06,506] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:06,510] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:06,519] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:13,815] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:13,817] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:14,049] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:14,097] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:14,123] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:14,212] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:14,298] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:14,326] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:14,331] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:14,339] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:14,424] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:14,502] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:14,508] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:14,571] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:14,570] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:14,576] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:14,581] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:14,593] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:14,603] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:14,603] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:14,635] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:14,657] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:14,659] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:14,667] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:14,690] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:14,754] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:14,768] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:14,800] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:14,820] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:14,894] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:15,006] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:15,054] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:15,358] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:15,377] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:15,407] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:15,465] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:15,467] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:15,467] [INFO] [comm.py:706:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
|
||||
[2026-04-13 22:57:15,484] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:15,485] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:15,501] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:15,805] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:15,845] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:15,845] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:15,854] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:15,906] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:15,910] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:15,950] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:15,955] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:15,979] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:15,979] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:15,981] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:15,981] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:15,986] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:15,989] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:15,992] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:16,003] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:16,004] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:16,009] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:16,023] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:16,044] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:16,050] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:16,054] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:22,654] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:22,662] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:22,662] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:22,677] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:22,680] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:22,683] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:22,684] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:22,684] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:22,690] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:22,694] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:22,710] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:22,750] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:22,751] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:22,753] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:22,757] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:22,769] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:22,775] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:25,930] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:25,930] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:25,930] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:25,930] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:25,930] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:25,930] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:25,930] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:25,930] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:26,065] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:26,065] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:26,065] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:26,065] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:26,065] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:26,065] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:26,065] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:26,065] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:26,534] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:26,534] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:26,534] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:26,534] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:26,534] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:26,534] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:26,534] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:26,535] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:57:26,700] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:37,768] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:37,768] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:37,768] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:37,768] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:37,768] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:37,768] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:37,768] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:37,769] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:37,796] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:37,796] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:37,796] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:37,796] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:37,796] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:37,796] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:37,796] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:37,796] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:38,359] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:38,359] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:38,359] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:38,359] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:38,359] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:38,359] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:38,359] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:38,360] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:57:40,194] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:40,194] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:40,194] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:40,194] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:40,194] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:40,215] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:40,215] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:40,215] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:40,215] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:40,215] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:40,215] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:40,216] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:40,363] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:40,363] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:40,363] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:40,363] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:40,364] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:47,001] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:47,049] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:47,061] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:47,091] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:47,098] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:47,098] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:47,105] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:57:50,876] [INFO] [partition_parameters.py:348:__exit__] finished initializing model - num_params = 399, num_elems = 8.19B
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
|
||||
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-1089
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-3035
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-1062
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-1089
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-13_batch-block1-3544
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-1062
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-15_batch-block1-3736
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-3544
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-15_batch-block1-3736
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-13_batch-block1-3544
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-15_batch-block1-3736
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-3035
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-15_batch-block1-3736
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-1062
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-1089
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-1089
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-1089
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-1062
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-3544
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-3544
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-1062
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-15_batch-block1-1062
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-15_batch-block1-3736
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-3035
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-1089
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-1062
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-3544
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-3035
|
||||
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-3035
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-3035
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-1089
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-15_batch-block1-3736
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-1089
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-3544
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-3544
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collatorUsing Prefix Tree collator
|
||||
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-15_batch-block1-1062
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-3035
|
||||
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-3035
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-15_batch-block1-3736
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-15_batch-block1-3736
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collatorUsing Prefix Tree collator
|
||||
|
||||
Using Prefix Tree collatorUsing Prefix Tree collator
|
||||
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-37_batch-block1-3163
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-37_batch-block1-3163
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-37_batch-block1-3163
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-37_batch-block1-3163args.report_to: ['wandb']
|
||||
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-37_batch-block1-3163
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-37_batch-block1-3163
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-37_batch-block1-3163
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-37_batch-block1-3163
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collatorUsing Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-38_batch-block1-3305
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-38_batch-block1-3305
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-38_batch-block1-3305args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-38_batch-block1-3305
|
||||
|
||||
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-38_batch-block1-3305
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-38_batch-block1-3305
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-38_batch-block1-3305
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-38_batch-block1-3305
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collatorUsing Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-37_batch-block1-3038
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-37_batch-block1-3038
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-37_batch-block1-3038
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-37_batch-block1-3038
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-37_batch-block1-3038
|
||||
args.report_to: ['wandb']args.report_to: ['wandb']
|
||||
|
||||
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-37_batch-block1-3038
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-37_batch-block1-3038
|
||||
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-37_batch-block1-3038
|
||||
Parameter Offload: Total persistent parameters: 308224 in 145 params
|
||||
{'train_runtime': 4.1282, 'train_samples_per_second': 1868.144, 'train_steps_per_second': 15.503, 'train_loss': 0.0, 'epoch': 8.0}
|
||||
[1;34mwandb[0m:
|
||||
[1;34mwandb[0m: 🚀 View run [33mruns/dev/tw-8node-resume[0m at: [34mhttps://wandb.ai/ligeng-zhu/ThreadWeaver/runs/tw-8node-resume[0m
|
||||
594
slurm/9168628.0.err
Normal file
594
slurm/9168628.0.err
Normal file
File diff suppressed because one or more lines are too long
630
slurm/9168628.0.out
Normal file
630
slurm/9168628.0.out
Normal file
@@ -0,0 +1,630 @@
|
||||
SLURM_JOB_ID = 9168628
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
NODES = batch-block1-3061 batch-block1-3736 batch-block1-3305 batch-block1-3544 batch-block1-1089 batch-block1-3163 batch-block1-3038 batch-block1-3035
|
||||
NODE_RANK = 0
|
||||
GPUS_PER_NODE = 8
|
||||
MASTER_ADDR = batch-block1-3061
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
SLURM_JOB_ID = 9168628
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
SLURM_JOB_ID = 9168628
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
SLURM_JOB_ID = 9168628
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
SLURM_JOB_ID = 9168628
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
SLURM_JOB_ID = 9168628
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
NODES = batch-block1-3061 batch-block1-3736 batch-block1-3305 batch-block1-3544 batch-block1-1089 batch-block1-3163 batch-block1-3038 batch-block1-3035
|
||||
NODE_RANK = 7
|
||||
GPUS_PER_NODE = 8
|
||||
NODES = batch-block1-3061 batch-block1-3736 batch-block1-3305 batch-block1-3544 batch-block1-1089 batch-block1-3163 batch-block1-3038 batch-block1-3035
|
||||
SLURM_JOB_ID = 9168628
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
NODE_RANK = 2
|
||||
GPUS_PER_NODE = 8
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
NODES = batch-block1-3061 batch-block1-3736 batch-block1-3305 batch-block1-3544 batch-block1-1089 batch-block1-3163 batch-block1-3038 batch-block1-3035
|
||||
NODE_RANK = 6
|
||||
GPUS_PER_NODE = 8
|
||||
NODES = batch-block1-3061 batch-block1-3736 batch-block1-3305 batch-block1-3544 batch-block1-1089 batch-block1-3163 batch-block1-3038 batch-block1-3035
|
||||
NODE_RANK = 3
|
||||
GPUS_PER_NODE = 8
|
||||
NODES = batch-block1-3061 batch-block1-3736 batch-block1-3305 batch-block1-3544 batch-block1-1089 batch-block1-3163 batch-block1-3038 batch-block1-3035
|
||||
NODE_RANK = 4
|
||||
GPUS_PER_NODE = 8
|
||||
MASTER_ADDR = batch-block1-3061
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
MASTER_ADDR = batch-block1-3061
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
MASTER_ADDR = batch-block1-3061
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
NODES = batch-block1-3061 batch-block1-3736 batch-block1-3305 batch-block1-3544 batch-block1-1089 batch-block1-3163 batch-block1-3038 batch-block1-3035
|
||||
NODE_RANK = 5
|
||||
GPUS_PER_NODE = 8
|
||||
MASTER_ADDR = batch-block1-3061
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
MASTER_ADDR = batch-block1-3061
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
MASTER_ADDR = batch-block1-3061
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
SLURM_JOB_ID = 9168628
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
NODES = batch-block1-3061 batch-block1-3736 batch-block1-3305 batch-block1-3544 batch-block1-1089 batch-block1-3163 batch-block1-3038 batch-block1-3035
|
||||
NODE_RANK = 1
|
||||
GPUS_PER_NODE = 8
|
||||
MASTER_ADDR = batch-block1-3061
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1Imported prefix tree collator v1
|
||||
|
||||
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
[2026-04-13 22:59:31,354] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,508] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,522] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,534] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,537] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,571] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,609] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,624] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,732] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,735] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,750] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,765] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,766] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,812] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,824] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,830] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,843] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,848] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,851] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,858] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,863] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,870] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,870] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,880] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,882] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,886] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,888] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,890] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,892] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,894] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,894] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,896] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,900] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,904] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,954] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,956] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,962] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,968] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,970] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:31,978] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:32,119] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:32,126] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:32,136] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:32,172] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:32,178] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:32,183] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:32,197] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:32,205] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:32,207] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:32,210] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:32,211] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:32,295] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:32,309] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:32,309] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:32,311] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:32,314] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:39,924] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,048] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,160] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,170] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,319] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,344] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,387] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,400] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,426] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,432] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,447] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,478] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,482] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,490] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,508] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,545] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,551] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,569] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,571] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,577] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,638] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,638] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,657] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,666] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,671] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,677] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,681] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,682] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,715] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,751] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,753] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,756] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,778] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,812] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,826] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,888] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,912] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,934] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,938] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,941] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,952] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,984] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:40,989] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:41,068] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:41,107] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:41,124] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:41,126] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:41,129] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:41,133] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:41,138] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:41,345] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:41,463] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:41,478] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:41,507] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:41,508] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:41,517] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 22:59:42,784] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:42,797] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:42,811] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:42,813] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:42,824] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:42,827] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:42,828] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:42,842] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:42,883] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:42,935] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:42,947] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:42,949] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:42,954] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:42,962] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:42,962] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:42,968] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:42,971] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:47,499] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:47,501] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:47,501] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:47,540] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:47,553] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:47,561] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:47,562] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:47,587] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:47,590] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:47,594] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:47,595] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:47,597] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:47,599] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:47,610] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:47,612] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:47,613] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:47,696] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:52,018] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:52,018] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:52,018] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:52,018] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:52,018] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:52,018] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:52,018] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:52,018] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 22:59:52,424] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:52,454] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:52,470] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:52,483] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:52,483] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:52,486] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:52,504] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:52,505] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:52,509] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:52,510] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:52,513] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:52,538] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:52,540] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:52,541] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:52,541] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:52,542] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:52,545] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:52,550] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:52,551] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:52,570] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:52,572] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 22:59:52,575] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:00:03,732] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:00:03,732] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:00:03,732] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:00:03,732] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:00:03,732] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:00:03,732] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:00:03,732] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:00:03,732] [INFO] [comm.py:706:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
|
||||
[2026-04-13 23:00:03,732] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:00:05,419] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:00:05,419] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:00:05,419] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:00:05,419] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:00:05,419] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:00:05,419] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:00:05,419] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:00:05,419] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:00:10,135] [INFO] [partition_parameters.py:348:__exit__] finished initializing model - num_params = 399, num_elems = 8.19B
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
|
||||
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-41_batch-block1-3736
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-1089
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-1089
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3038
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-39_batch-block1-3544
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3544
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3305
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3038
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3035
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3163
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3038
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3038
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3163
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3038
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-41_batch-block1-3736
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3035
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3038
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-1089
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-41_batch-block1-3305
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3544
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3163
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3305
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-41_batch-block1-3305
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-41_batch-block1-3736
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-1089
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3035
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3038
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3038
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-41_batch-block1-3736
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3544
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3163
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-1089
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-41_batch-block1-3736
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collatorUsing Prefix Tree collator
|
||||
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-1089
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collatorUsing Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3163
|
||||
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3163
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3163
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3544
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3544
|
||||
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3544
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-41_batch-block1-3736
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3035
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3163
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-41_batch-block1-3736
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3035
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-41_batch-block1-1089
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-41_batch-block1-3736
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3544
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3305
|
||||
Using Prefix Tree collatorUsing Prefix Tree collator
|
||||
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-41_batch-block1-3305
|
||||
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-41_batch-block1-3305
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3035
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3035
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-1089
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3305
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3035
|
||||
Using Prefix Tree collatorUsing Prefix Tree collatorUsing Prefix Tree collatorUsing Prefix Tree collator
|
||||
|
||||
|
||||
|
||||
Using Prefix Tree collatorUsing Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']args.report_to: ['wandb']args.report_to: ['wandb']
|
||||
|
||||
|
||||
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-00-03_batch-block1-3061args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-00-03_batch-block1-3061args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-00-03_batch-block1-3061
|
||||
|
||||
|
||||
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-00-03_batch-block1-3061
|
||||
args.report_to: ['wandb']args.report_to: ['wandb']
|
||||
|
||||
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-00-03_batch-block1-3061args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-00-03_batch-block1-3061
|
||||
|
||||
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-00-03_batch-block1-3061
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-00-03_batch-block1-3061
|
||||
Parameter Offload: Total persistent parameters: 308224 in 145 params
|
||||
{'train_runtime': 3.8014, 'train_samples_per_second': 2028.721, 'train_steps_per_second': 16.836, 'train_loss': 0.0, 'epoch': 8.0}
|
||||
[1;34mwandb[0m:
|
||||
[1;34mwandb[0m: 🚀 View run [33mruns/dev/tw-8node-resume[0m at: [34mhttps://wandb.ai/ligeng-zhu/ThreadWeaver/runs/tw-8node-resume[0m
|
||||
594
slurm/9168633.0.err
Normal file
594
slurm/9168633.0.err
Normal file
File diff suppressed because one or more lines are too long
630
slurm/9168633.0.out
Normal file
630
slurm/9168633.0.out
Normal file
@@ -0,0 +1,630 @@
|
||||
SLURM_JOB_ID = 9168633
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
NODES = batch-block1-1015 batch-block1-3297 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3908 batch-block1-3227
|
||||
NODE_RANK = 5
|
||||
GPUS_PER_NODE = 8
|
||||
SLURM_JOB_ID = 9168633
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
MASTER_ADDR = batch-block1-1015
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
SLURM_JOB_ID = 9168633
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
SLURM_JOB_ID = 9168633
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
NODES = batch-block1-1015 batch-block1-3297 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3908 batch-block1-3227
|
||||
NODE_RANK = 3
|
||||
GPUS_PER_NODE = 8
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
NODES = batch-block1-1015 batch-block1-3297 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3908 batch-block1-3227
|
||||
NODE_RANK = 7
|
||||
GPUS_PER_NODE = 8
|
||||
NODES = batch-block1-1015 batch-block1-3297 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3908 batch-block1-3227
|
||||
NODE_RANK = 6
|
||||
GPUS_PER_NODE = 8
|
||||
MASTER_ADDR = batch-block1-1015
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
MASTER_ADDR = batch-block1-1015
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
MASTER_ADDR = batch-block1-1015
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
SLURM_JOB_ID = 9168633
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
NODES = batch-block1-1015 batch-block1-3297 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3908 batch-block1-3227
|
||||
NODE_RANK = 2
|
||||
GPUS_PER_NODE = 8
|
||||
SLURM_JOB_ID = 9168633
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
MASTER_ADDR = batch-block1-1015
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
NODES = batch-block1-1015 batch-block1-3297 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3908 batch-block1-3227
|
||||
NODE_RANK = 1
|
||||
GPUS_PER_NODE = 8
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
SLURM_JOB_ID = 9168633
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
MASTER_ADDR = batch-block1-1015
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
NODES = batch-block1-1015 batch-block1-3297 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3908 batch-block1-3227
|
||||
NODE_RANK = 0
|
||||
GPUS_PER_NODE = 8
|
||||
MASTER_ADDR = batch-block1-1015
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
SLURM_JOB_ID = 9168633
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
NODES = batch-block1-1015 batch-block1-3297 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3908 batch-block1-3227
|
||||
NODE_RANK = 4
|
||||
GPUS_PER_NODE = 8
|
||||
MASTER_ADDR = batch-block1-1015
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
|
||||
Imported prefix tree collator v1Imported prefix tree collator v1
|
||||
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1Imported prefix tree collator v1
|
||||
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1Imported prefix tree collator v1
|
||||
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1Imported prefix tree collator v1
|
||||
|
||||
Imported prefix tree collator v1Imported prefix tree collator v1
|
||||
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1Imported prefix tree collator v1
|
||||
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1Imported prefix tree collator v1
|
||||
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
[2026-04-13 23:03:58,482] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:03:58,482] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:03:58,609] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:03:58,667] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:03:58,672] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:03:58,672] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:03:58,672] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:03:58,672] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:04:06,850] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:06,866] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:07,134] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:07,203] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:07,325] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:07,329] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:07,336] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:07,365] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:08,644] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:08,757] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:08,759] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:08,761] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:08,761] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:08,766] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:08,769] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:08,769] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:17,775] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:04:17,775] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:04:17,775] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:04:17,775] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:04:17,775] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:04:17,775] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:04:17,775] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:04:17,775] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:04:17,775] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:04:17,775] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:04:17,775] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:04:17,775] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:04:17,775] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:04:17,775] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:04:17,775] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:04:17,776] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:04:18,422] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:04:18,422] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:04:18,422] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:04:18,422] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:04:18,422] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:04:18,422] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:04:18,422] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:04:18,423] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:04:18,531] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:04:18,531] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:04:18,531] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:04:18,531] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:04:18,531] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:04:18,531] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:04:18,531] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:04:18,531] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:04:18,582] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:04:18,582] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:04:18,582] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:04:18,582] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:04:18,582] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:04:18,582] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:04:18,582] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:04:18,582] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:04:18,773] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:04:18,773] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:04:18,773] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:04:18,773] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:04:18,773] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:04:18,773] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:04:18,773] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:04:18,774] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:04:19,192] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:04:19,192] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:04:19,192] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:04:19,192] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:04:19,192] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:04:19,192] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:04:19,192] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:04:19,193] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:04:29,337] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:29,337] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:29,337] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:29,337] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:29,337] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:29,337] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:29,337] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:29,337] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:29,376] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:29,376] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:29,376] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:29,376] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:29,376] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:29,376] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:29,376] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:29,376] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:30,218] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:30,218] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:30,218] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:30,218] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:30,218] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:30,218] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:30,218] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:30,218] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:30,223] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:30,223] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:30,223] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:30,223] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:30,223] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:30,224] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:30,224] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:30,224] [INFO] [comm.py:706:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
|
||||
[2026-04-13 23:04:30,224] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:30,348] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:30,348] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:30,348] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:30,348] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:30,349] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:30,349] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:30,349] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:30,349] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:30,351] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:30,351] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:30,351] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:30,351] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:30,351] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:30,351] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:30,351] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:30,351] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:31,121] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:31,122] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:31,122] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:31,172] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:31,172] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:31,172] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:31,172] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:31,172] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:31,210] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:31,224] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:31,224] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:31,224] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:31,224] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:31,224] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:31,224] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:31,224] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:31,224] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:04:31,227] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:31,246] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:31,247] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:31,358] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:31,358] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:31,358] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:31,359] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:31,893] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:31,893] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:31,893] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:31,893] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:31,893] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:31,894] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:31,894] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:31,895] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:36,779] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:36,822] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:36,822] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:36,850] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:36,862] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:36,874] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:36,896] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:36,900] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:36,906] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:36,908] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:36,911] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:36,914] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:36,923] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:36,932] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:36,932] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:36,953] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:36,955] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:36,961] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:36,969] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:36,969] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:36,969] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:36,974] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:36,976] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:36,991] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:37,042] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:37,043] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:37,075] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:37,097] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:37,103] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:37,104] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:37,117] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:37,195] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:04:41,568] [INFO] [partition_parameters.py:348:__exit__] finished initializing model - num_params = 399, num_elems = 8.19B
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
|
||||
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
|
||||
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-07_batch-block1-3273
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-07_batch-block1-3273
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-07_batch-block1-3273
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-06_batch-block1-3273
|
||||
Using Prefix Tree collatorUsing Prefix Tree collator
|
||||
|
||||
args.report_to: ['wandb']args.report_to: ['wandb']
|
||||
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-06_batch-block1-3273args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-07_batch-block1-3273
|
||||
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-07_batch-block1-3273
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-07_batch-block1-3273
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collatorUsing Prefix Tree collator
|
||||
Using Prefix Tree collatorUsing Prefix Tree collator
|
||||
|
||||
|
||||
Using Prefix Tree collatorUsing Prefix Tree collatorUsing Prefix Tree collator
|
||||
|
||||
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-3908
|
||||
args.report_to: ['wandb']args.report_to: ['wandb']args.report_to: ['wandb']
|
||||
|
||||
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-3908args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-3908args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-3908
|
||||
|
||||
|
||||
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collatorUsing Prefix Tree collatorUsing Prefix Tree collator
|
||||
|
||||
Using Prefix Tree collator
|
||||
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-3908
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-3908
|
||||
|
||||
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-3908args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-3908
|
||||
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1027
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1027
|
||||
args.report_to: ['wandb']args.report_to: ['wandb']
|
||||
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1027args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1027
|
||||
|
||||
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1027
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1027
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1027
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1027
|
||||
Using Prefix Tree collatorUsing Prefix Tree collatorUsing Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
|
||||
|
||||
Using Prefix Tree collatorUsing Prefix Tree collatorUsing Prefix Tree collator
|
||||
|
||||
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']args.report_to: ['wandb']
|
||||
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-29_batch-block1-3227args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-29_batch-block1-3227
|
||||
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-29_batch-block1-3227
|
||||
args.report_to: ['wandb']args.report_to: ['wandb']args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-29_batch-block1-3227args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-29_batch-block1-3227args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-29_batch-block1-3227
|
||||
|
||||
|
||||
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-29_batch-block1-3227
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-29_batch-block1-3227
|
||||
Using Prefix Tree collatorUsing Prefix Tree collator
|
||||
|
||||
Using Prefix Tree collatorUsing Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1015
|
||||
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1015
|
||||
args.report_to: ['wandb']args.report_to: ['wandb']
|
||||
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1015args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1015
|
||||
|
||||
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1015
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1015
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1015
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1015
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1134
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1134
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1134
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1134
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1134
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1134
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1134
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1134
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collatorUsing Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
|
||||
Using Prefix Tree collatorUsing Prefix Tree collator
|
||||
|
||||
Using Prefix Tree collatorUsing Prefix Tree collator
|
||||
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-31_batch-block1-3848args.report_to: ['wandb']
|
||||
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-31_batch-block1-3848
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-31_batch-block1-3848
|
||||
args.report_to: ['wandb']args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-31_batch-block1-3848
|
||||
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-31_batch-block1-3848
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-31_batch-block1-3848
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-31_batch-block1-3848args.report_to: ['wandb']
|
||||
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-31_batch-block1-3848
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collatorUsing Prefix Tree collatorUsing Prefix Tree collatorUsing Prefix Tree collator
|
||||
|
||||
|
||||
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collatorUsing Prefix Tree collator
|
||||
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-29_batch-block1-3297
|
||||
args.report_to: ['wandb']args.report_to: ['wandb']
|
||||
|
||||
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-29_batch-block1-3297args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-29_batch-block1-3297args.report_to: ['wandb']
|
||||
|
||||
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-29_batch-block1-3297
|
||||
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-29_batch-block1-3297
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-29_batch-block1-3297
|
||||
args.report_to: ['wandb']args.report_to: ['wandb']
|
||||
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-29_batch-block1-3297args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-29_batch-block1-3297
|
||||
|
||||
Parameter Offload: Total persistent parameters: 308224 in 145 params
|
||||
{'train_runtime': 3.8131, 'train_samples_per_second': 2022.522, 'train_steps_per_second': 16.784, 'train_loss': 0.0, 'epoch': 8.0}
|
||||
[1;34mwandb[0m:
|
||||
[1;34mwandb[0m: 🚀 View run [33mruns/dev/tw-8node-resume[0m at: [34mhttps://wandb.ai/ligeng-zhu/ThreadWeaver/runs/tw-8node-resume[0m
|
||||
594
slurm/9168640.0.err
Normal file
594
slurm/9168640.0.err
Normal file
File diff suppressed because one or more lines are too long
630
slurm/9168640.0.out
Normal file
630
slurm/9168640.0.out
Normal file
@@ -0,0 +1,630 @@
|
||||
SLURM_JOB_ID = 9168640
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
SLURM_JOB_ID = 9168640
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
SLURM_JOB_ID = 9168640
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
SLURM_JOB_ID = 9168640
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
SLURM_JOB_ID = 9168640
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
SLURM_JOB_ID = 9168640
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
NODES = batch-block1-1015 batch-block1-3297 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3908 batch-block1-3227
|
||||
NODE_RANK = 3
|
||||
GPUS_PER_NODE = 8
|
||||
NODES = batch-block1-1015 batch-block1-3297 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3908 batch-block1-3227
|
||||
NODE_RANK = 7
|
||||
GPUS_PER_NODE = 8
|
||||
NODES = batch-block1-1015 batch-block1-3297 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3908 batch-block1-3227
|
||||
NODE_RANK = 4
|
||||
GPUS_PER_NODE = 8
|
||||
SLURM_JOB_ID = 9168640
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
NODES = batch-block1-1015 batch-block1-3297 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3908 batch-block1-3227
|
||||
NODE_RANK = 6
|
||||
GPUS_PER_NODE = 8
|
||||
NODES = batch-block1-1015 batch-block1-3297 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3908 batch-block1-3227
|
||||
NODE_RANK = 5
|
||||
GPUS_PER_NODE = 8
|
||||
NODES = batch-block1-1015 batch-block1-3297 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3908 batch-block1-3227
|
||||
NODE_RANK = 1
|
||||
GPUS_PER_NODE = 8
|
||||
MASTER_ADDR = batch-block1-1015
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
MASTER_ADDR = batch-block1-1015
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
MASTER_ADDR = batch-block1-1015
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
NODES = batch-block1-1015 batch-block1-3297 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3908 batch-block1-3227
|
||||
NODE_RANK = 2
|
||||
GPUS_PER_NODE = 8
|
||||
MASTER_ADDR = batch-block1-1015
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
MASTER_ADDR = batch-block1-1015
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
MASTER_ADDR = batch-block1-1015
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
MASTER_ADDR = batch-block1-1015
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
SLURM_JOB_ID = 9168640
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
NODES = batch-block1-1015 batch-block1-3297 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3908 batch-block1-3227
|
||||
NODE_RANK = 0
|
||||
GPUS_PER_NODE = 8
|
||||
MASTER_ADDR = batch-block1-1015
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
[2026-04-13 23:06:06,690] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,691] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,738] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,739] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,745] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,746] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,753] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,753] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,806] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,811] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,827] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,828] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,837] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,837] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,840] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,843] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,844] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,849] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,849] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,851] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,853] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,860] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,865] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,870] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,880] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,890] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,892] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,895] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,896] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,898] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,898] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,919] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,927] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,927] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,932] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,931] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,944] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,953] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,960] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,963] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,968] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,973] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,974] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,976] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:06,991] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:07,006] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:07,009] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:07,010] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:07,010] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:07,010] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:07,016] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:07,036] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:07,037] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:07,039] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:07,040] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:07,040] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:07,041] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:07,077] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:07,077] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:07,081] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:07,104] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:07,107] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:07,109] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:07,124] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:06:14,863] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:14,928] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:14,998] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,030] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,096] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,122] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,201] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,223] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,228] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,260] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,282] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,287] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,300] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,314] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,331] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,339] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,344] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,389] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,404] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,432] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,438] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,484] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,522] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,559] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,610] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,657] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,670] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,676] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,809] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,823] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,887] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,912] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,919] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,931] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,935] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,963] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:15,974] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:16,071] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:16,073] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:16,074] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:16,102] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:16,116] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:16,116] [INFO] [comm.py:706:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
|
||||
[2026-04-13 23:06:16,209] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:16,210] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:16,230] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:16,307] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:16,315] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:16,317] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:16,373] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:16,380] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:16,381] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:16,386] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:16,406] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:16,420] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:16,463] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:16,464] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:16,467] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:16,573] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,578] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,583] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,613] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,613] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,620] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,627] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,628] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,631] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,648] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,667] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,669] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,689] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,696] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:16,720] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,723] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,724] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,750] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,763] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,778] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,800] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:16,827] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:16,839] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:16,844] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:16,858] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:16,886] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:06:16,892] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,893] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,896] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,898] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,904] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,910] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,911] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,913] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,922] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,951] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,952] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:16,954] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:17,285] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:17,331] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:17,427] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:17,444] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:17,446] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:17,456] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:17,464] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:23,267] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:23,284] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:23,306] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:23,330] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:23,359] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:23,360] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:23,367] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:23,373] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:23,374] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:23,383] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:23,373] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:23,446] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:23,509] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:23,510] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:23,520] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:23,537] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:27,059] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:27,080] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:27,101] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:27,149] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:27,154] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:27,158] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:27,171] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:27,171] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:27,194] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:27,195] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:06:31,119] [INFO] [partition_parameters.py:348:__exit__] finished initializing model - num_params = 399, num_elems = 8.19B
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-3297
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-1134
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-1134
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-3848
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-1134
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-3848
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-3848
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-14_batch-block1-1027
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-1027
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-1134
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3227
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-1015
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3273
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-3848
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3227
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-3297
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3273
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3273
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-3297
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3227
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-1027
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-3848
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-1027
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-1015
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-1134
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-3297
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3273
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-3848
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3227
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-3848
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3273
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3227
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-1015
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3908
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-3297
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collatorargs.report_to: ['wandb']
|
||||
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3908
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-1134
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3908
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-1015
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3908
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3908
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3273
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-1134
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-1015
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-1015
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3227
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3273
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-1027
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-3297
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-3848
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-1027
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-1015
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3908
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-1027
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-1015
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-3297
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-14_batch-block1-1027
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3908
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3273
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3227
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-3227
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-3297
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3908
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-14_batch-block1-1134
|
||||
Parameter Offload: Total persistent parameters: 308224 in 145 params
|
||||
{'train_runtime': 3.1226, 'train_samples_per_second': 2469.703, 'train_steps_per_second': 20.495, 'train_loss': 0.0, 'epoch': 8.0}
|
||||
[1;34mwandb[0m:
|
||||
[1;34mwandb[0m: 🚀 View run [33mruns/dev/tw-8node-resume[0m at: [34mhttps://wandb.ai/ligeng-zhu/ThreadWeaver/runs/tw-8node-resume[0m
|
||||
594
slurm/9168643.0.err
Normal file
594
slurm/9168643.0.err
Normal file
File diff suppressed because one or more lines are too long
630
slurm/9168643.0.out
Normal file
630
slurm/9168643.0.out
Normal file
@@ -0,0 +1,630 @@
|
||||
SLURM_JOB_ID = 9168643
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
NODES = batch-block1-1015 batch-block1-3297 batch-block1-3887 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3227
|
||||
NODE_RANK = 2
|
||||
GPUS_PER_NODE = 8
|
||||
MASTER_ADDR = batch-block1-1015
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
SLURM_JOB_ID = 9168643
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
SLURM_JOB_ID = 9168643
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
SLURM_JOB_ID = 9168643
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
SLURM_JOB_ID = 9168643
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
SLURM_JOB_ID = 9168643
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
SLURM_JOB_ID = 9168643
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
NODES = batch-block1-1015 batch-block1-3297 batch-block1-3887 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3227
|
||||
NODE_RANK = 6
|
||||
GPUS_PER_NODE = 8
|
||||
SLURM_JOB_ID = 9168643
|
||||
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
|
||||
RUN_NAME = tw-8node-resume
|
||||
OUTPUT_DIR = runs/dev/tw-8node-resume
|
||||
NNODES = 8
|
||||
NODES = batch-block1-1015 batch-block1-3297 batch-block1-3887 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3227
|
||||
NODE_RANK = 4
|
||||
GPUS_PER_NODE = 8
|
||||
NODES = batch-block1-1015 batch-block1-3297 batch-block1-3887 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3227
|
||||
NODE_RANK = 5
|
||||
GPUS_PER_NODE = 8
|
||||
NODES = batch-block1-1015 batch-block1-3297 batch-block1-3887 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3227
|
||||
NODE_RANK = 1
|
||||
GPUS_PER_NODE = 8
|
||||
NODES = batch-block1-1015 batch-block1-3297 batch-block1-3887 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3227
|
||||
NODE_RANK = 7
|
||||
GPUS_PER_NODE = 8
|
||||
NODES = batch-block1-1015 batch-block1-3297 batch-block1-3887 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3227
|
||||
NODE_RANK = 3
|
||||
GPUS_PER_NODE = 8
|
||||
MASTER_ADDR = batch-block1-1015
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
NODES = batch-block1-1015 batch-block1-3297 batch-block1-3887 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3227
|
||||
NODE_RANK = 0
|
||||
GPUS_PER_NODE = 8
|
||||
MASTER_ADDR = batch-block1-1015
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
MASTER_ADDR = batch-block1-1015
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
MASTER_ADDR = batch-block1-1015
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
MASTER_ADDR = batch-block1-1015
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
MASTER_ADDR = batch-block1-1015
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
MASTER_ADDR = batch-block1-1015
|
||||
MASTER_PORT = 25001
|
||||
GLOBAL_TRAIN_BATCH_SIZE =
|
||||
GRADIENT_ACCUMULATION_STEPS =
|
||||
PER_DEVICE_TRAIN_BATCH_SIZE =
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1Imported prefix tree collator v1
|
||||
|
||||
Imported prefix tree collator v1Imported prefix tree collator v1
|
||||
|
||||
Imported prefix tree collator v1Imported prefix tree collator v1
|
||||
|
||||
Imported prefix tree collator v1
|
||||
Imported prefix tree collator v1
|
||||
[2026-04-13 23:07:56,557] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:56,558] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:56,616] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:56,617] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:56,643] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:56,643] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:56,731] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:56,747] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:56,760] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:56,762] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:56,785] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:56,787] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:56,787] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:56,792] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:56,800] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:56,806] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:56,920] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:56,927] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:56,955] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,003] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,028] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,036] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,071] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,082] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,082] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,083] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,083] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,083] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,084] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,085] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,085] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,086] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,087] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,088] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,092] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,093] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,098] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,120] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,122] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,125] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,140] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,142] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,147] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,163] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,167] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,170] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,172] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,175] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,175] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,179] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,185] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,204] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,217] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,219] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,230] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:07:57,237] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:08:05,010] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,025] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,053] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,070] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,071] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,113] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,253] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,279] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,373] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,378] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,432] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,464] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,500] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,548] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,609] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,613] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,629] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,656] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,660] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,691] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,693] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,693] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,694] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,740] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,782] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,795] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,806] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,823] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,831] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,841] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,845] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,847] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,850] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,859] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,859] [INFO] [comm.py:706:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
|
||||
[2026-04-13 23:08:05,881] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,893] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,948] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,965] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,970] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:05,989] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:06,001] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:06,007] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:06,011] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:06,013] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:06,039] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:06,078] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:06,118] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:06,128] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:06,129] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:06,131] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:06,132] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:06,145] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:06,193] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:06,197] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:06,240] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:06,257] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:06,371] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,391] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,431] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,444] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,492] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,500] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,506] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,510] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,514] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,518] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,518] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,518] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,520] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,523] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,528] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,529] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,540] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,546] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,550] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,551] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,555] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,579] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,609] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,630] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,645] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,697] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,714] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,714] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,732] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,735] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,740] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,740] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:06,755] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:13,160] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:13,184] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:13,184] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:13,185] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:13,224] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:13,239] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:13,240] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:13,243] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:13,243] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:13,245] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:13,254] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:13,264] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:13,272] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:13,291] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:13,452] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:13,457] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:17,477] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:17,531] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:17,547] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:17,548] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:17,558] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:17,645] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:17,648] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:18,097] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:08:18,097] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:08:18,097] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:08:18,097] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:08:18,097] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:08:18,097] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:08:18,097] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:08:18,098] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||
[2026-04-13 23:08:30,097] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:30,098] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:30,098] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:30,099] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:30,109] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:30,110] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:30,110] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:30,110] [INFO] [comm.py:675:init_distributed] cdb=None
|
||||
[2026-04-13 23:08:32,677] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:32,677] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:32,677] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:32,677] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:32,677] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:32,677] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:32,677] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:32,677] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
|
||||
[2026-04-13 23:08:36,881] [INFO] [partition_parameters.py:348:__exit__] finished initializing model - num_params = 399, num_elems = 8.19B
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3273
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1134
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-3297
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3848
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3848
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1134
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-1134
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1015
|
||||
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1027
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3273
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-1015
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-3297
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-1015
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3848
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3848
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3273
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-3297
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1134
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3273
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-1015
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3273
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3273
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3273
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-3297
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-1134
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3848
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-1015
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3848
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1134
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3227
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3297
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1027
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3297
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3848
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3227
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-1015
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1027
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3273
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3227
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3227
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-3297
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1027
|
||||
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1027
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1027
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3227
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3227
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-3297
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3848
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3227
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1027
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1015
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1027
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-1015
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1134
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-1134
|
||||
Using Prefix Tree collator
|
||||
Using Prefix Tree collatorUsing Prefix Tree collator
|
||||
Using Prefix Tree collatorUsing Prefix Tree collator
|
||||
|
||||
|
||||
Using Prefix Tree collatorUsing Prefix Tree collatorUsing Prefix Tree collator
|
||||
|
||||
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-30_batch-block1-3887
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-30_batch-block1-3887
|
||||
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-30_batch-block1-3887
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-30_batch-block1-3887
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-30_batch-block1-3887
|
||||
args.report_to: ['wandb']
|
||||
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-30_batch-block1-3887
|
||||
|
||||
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-30_batch-block1-3887
|
||||
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-30_batch-block1-3887
|
||||
Using Prefix Tree collator
|
||||
args.report_to: ['wandb']
|
||||
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3227
|
||||
Parameter Offload: Total persistent parameters: 308224 in 145 params
|
||||
{'train_runtime': 3.1214, 'train_samples_per_second': 2470.663, 'train_steps_per_second': 20.503, 'train_loss': 0.0, 'epoch': 8.0}
|
||||
[1;34mwandb[0m:
|
||||
[1;34mwandb[0m: 🚀 View run [33mruns/dev/tw-8node-resume[0m at: [34mhttps://wandb.ai/ligeng-zhu/ThreadWeaver/runs/tw-8node-resume[0m
|
||||
124
special_tokens_map.json
Normal file
124
special_tokens_map.json
Normal file
@@ -0,0 +1,124 @@
|
||||
{
|
||||
"additional_special_tokens": [
|
||||
{
|
||||
"content": "<Think>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
{
|
||||
"content": "</Think>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
{
|
||||
"content": "<Parallel>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
{
|
||||
"content": "</Parallel>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
{
|
||||
"content": "<Outlines>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
{
|
||||
"content": "</Outlines>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
{
|
||||
"content": "<Outline>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
{
|
||||
"content": "</Outline>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
{
|
||||
"content": "<Trial>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
{
|
||||
"content": "</Trial>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
{
|
||||
"content": "<Subtask>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
{
|
||||
"content": "</Subtask>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
{
|
||||
"content": "<Thread>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
{
|
||||
"content": "</Thread>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
{
|
||||
"content": "<Conclusion>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
{
|
||||
"content": "</Conclusion>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
}
|
||||
],
|
||||
"eos_token": {
|
||||
"content": "<|im_end|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
"pad_token": "<|fim_pad|>"
|
||||
}
|
||||
3
tokenizer.json
Normal file
3
tokenizer.json
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:0233e510f0f3cba54c9223bf44453616eb93c1219f688a05efb426e217cee80a
|
||||
size 11425637
|
||||
371
tokenizer_config.json
Normal file
371
tokenizer_config.json
Normal file
@@ -0,0 +1,371 @@
|
||||
{
|
||||
"add_bos_token": false,
|
||||
"add_prefix_space": false,
|
||||
"added_tokens_decoder": {
|
||||
"151643": {
|
||||
"content": "<|endoftext|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"151644": {
|
||||
"content": "<|im_start|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"151645": {
|
||||
"content": "<|im_end|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"151646": {
|
||||
"content": "<|object_ref_start|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"151647": {
|
||||
"content": "<|object_ref_end|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"151648": {
|
||||
"content": "<|box_start|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"151649": {
|
||||
"content": "<|box_end|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"151650": {
|
||||
"content": "<|quad_start|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"151651": {
|
||||
"content": "<|quad_end|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"151652": {
|
||||
"content": "<|vision_start|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"151653": {
|
||||
"content": "<|vision_end|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"151654": {
|
||||
"content": "<|vision_pad|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"151655": {
|
||||
"content": "<|image_pad|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"151656": {
|
||||
"content": "<|video_pad|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"151657": {
|
||||
"content": "<tool_call>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": false
|
||||
},
|
||||
"151658": {
|
||||
"content": "</tool_call>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": false
|
||||
},
|
||||
"151659": {
|
||||
"content": "<|fim_prefix|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": false
|
||||
},
|
||||
"151660": {
|
||||
"content": "<|fim_middle|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": false
|
||||
},
|
||||
"151661": {
|
||||
"content": "<|fim_suffix|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": false
|
||||
},
|
||||
"151662": {
|
||||
"content": "<|fim_pad|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"151663": {
|
||||
"content": "<|repo_name|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": false
|
||||
},
|
||||
"151664": {
|
||||
"content": "<|file_sep|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": false
|
||||
},
|
||||
"151665": {
|
||||
"content": "<tool_response>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": false
|
||||
},
|
||||
"151666": {
|
||||
"content": "</tool_response>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": false
|
||||
},
|
||||
"151667": {
|
||||
"content": "<think>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": false
|
||||
},
|
||||
"151668": {
|
||||
"content": "</think>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": false
|
||||
},
|
||||
"151669": {
|
||||
"content": "<Think>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"151670": {
|
||||
"content": "</Think>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"151671": {
|
||||
"content": "<Parallel>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"151672": {
|
||||
"content": "</Parallel>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"151673": {
|
||||
"content": "<Outlines>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"151674": {
|
||||
"content": "</Outlines>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"151675": {
|
||||
"content": "<Outline>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"151676": {
|
||||
"content": "</Outline>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"151677": {
|
||||
"content": "<Trial>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"151678": {
|
||||
"content": "</Trial>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"151679": {
|
||||
"content": "<Subtask>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"151680": {
|
||||
"content": "</Subtask>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"151681": {
|
||||
"content": "<Thread>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"151682": {
|
||||
"content": "</Thread>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"151683": {
|
||||
"content": "<Conclusion>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
},
|
||||
"151684": {
|
||||
"content": "</Conclusion>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false,
|
||||
"special": true
|
||||
}
|
||||
},
|
||||
"additional_special_tokens": [
|
||||
"<Think>",
|
||||
"</Think>",
|
||||
"<Parallel>",
|
||||
"</Parallel>",
|
||||
"<Outlines>",
|
||||
"</Outlines>",
|
||||
"<Outline>",
|
||||
"</Outline>",
|
||||
"<Trial>",
|
||||
"</Trial>",
|
||||
"<Subtask>",
|
||||
"</Subtask>",
|
||||
"<Thread>",
|
||||
"</Thread>",
|
||||
"<Conclusion>",
|
||||
"</Conclusion>"
|
||||
],
|
||||
"bos_token": null,
|
||||
"chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {{- messages[0].content + '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n {%- set index = (messages|length - 1) - loop.index0 %}\n {%- if ns.multi_step_tool and message.role == \"user\" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}\n {%- set ns.multi_step_tool = false %}\n {%- set ns.last_query_index = index %}\n {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n {%- if message.content is string %}\n {%- set content = message.content %}\n {%- else %}\n {%- set content = '' %}\n {%- endif %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n {{- '<|im_start|>' + message.role + '\\n' + content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {%- set reasoning_content = '' %}\n {%- if message.reasoning_content is string %}\n {%- set reasoning_content = message.reasoning_content %}\n {%- else %}\n {%- if '</think>' in content %}\n {%- set reasoning_content = 
content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}\n {%- set content = content.split('</think>')[-1].lstrip('\\n') %}\n {%- endif %}\n {%- endif %}\n {%- if loop.index0 > ns.last_query_index %}\n {%- if loop.last or (not loop.last and reasoning_content) %}\n {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content.strip('\\n') + '\\n</think>\\n\\n' + content.lstrip('\\n') }}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n {%- if enable_thinking is defined and enable_thinking is false %}\n {{- '<think>\\n\\n</think>\\n\\n' }}\n {%- endif %}\n{%- endif %}",
|
||||
"clean_up_tokenization_spaces": false,
|
||||
"eos_token": "<|im_end|>",
|
||||
"errors": "replace",
|
||||
"extra_special_tokens": {},
|
||||
"model_max_length": 131072,
|
||||
"pad_token": "<|fim_pad|>",
|
||||
"split_special_tokens": false,
|
||||
"tokenizer_class": "Qwen2Tokenizer",
|
||||
"unk_token": null
|
||||
}
|
||||
3
training_args.bin
Normal file
3
training_args.bin
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:9ac6b9f2dd0653e4ac399a58c1b8c8a35aabd786bd251f673080a5b3f944d985
|
||||
size 7672
|
||||
1
vocab.json
Normal file
1
vocab.json
Normal file
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user