初始化项目,由ModelHub XC社区提供模型

Model: ligeng-dev/q3-8b-train_final_v2_nb2_mt8192_replaced_fix
Source: Original Platform
This commit is contained in:
ModelHub XC
2026-04-21 00:40:58 +08:00
commit 90ba3302c7
32 changed files with 203309 additions and 0 deletions

36
.gitattributes vendored Normal file
View File

@@ -0,0 +1,36 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
tokenizer.json filter=lfs diff=lfs merge=lfs -text

58
README.md Normal file
View File

@@ -0,0 +1,58 @@
---
base_model: Qwen/Qwen3-8B
library_name: transformers
model_name: tw-8node-resume
tags:
- generated_from_trainer
- trl
- sft
licence: license
---
# Model Card for tw-8node-resume
This model is a fine-tuned version of [Qwen/Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B).
It has been trained using [TRL](https://github.com/huggingface/trl).
## Quick start
```python
from transformers import pipeline
question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
generator = pipeline("text-generation", model="ligeng-dev/q3-8b-train_final_v2_nb2_mt8192_replaced_fix", device="cuda")
output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
print(output["generated_text"])
```
## Training procedure
[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/ligeng-zhu/ThreadWeaver/runs/tw-8node-resume)
This model was trained with SFT.
### Framework versions
- TRL: 0.19.0
- Transformers: 4.51.1
- PyTorch: 2.6.0
- Datasets: 3.6.0
- Tokenizers: 0.21.1
## Citations
Cite TRL as:
```bibtex
@misc{vonwerra2022trl,
title = {{TRL: Transformer Reinforcement Learning}},
author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec},
year = 2020,
journal = {GitHub repository},
publisher = {GitHub},
howpublished = {\url{https://github.com/huggingface/trl}}
}
```

44
added_tokens.json Normal file
View File

@@ -0,0 +1,44 @@
{
"</Conclusion>": 151684,
"</Outline>": 151676,
"</Outlines>": 151674,
"</Parallel>": 151672,
"</Subtask>": 151680,
"</Think>": 151670,
"</Thread>": 151682,
"</Trial>": 151678,
"</think>": 151668,
"</tool_call>": 151658,
"</tool_response>": 151666,
"<Conclusion>": 151683,
"<Outline>": 151675,
"<Outlines>": 151673,
"<Parallel>": 151671,
"<Subtask>": 151679,
"<Think>": 151669,
"<Thread>": 151681,
"<Trial>": 151677,
"<think>": 151667,
"<tool_call>": 151657,
"<tool_response>": 151665,
"<|box_end|>": 151649,
"<|box_start|>": 151648,
"<|endoftext|>": 151643,
"<|file_sep|>": 151664,
"<|fim_middle|>": 151660,
"<|fim_pad|>": 151662,
"<|fim_prefix|>": 151659,
"<|fim_suffix|>": 151661,
"<|im_end|>": 151645,
"<|im_start|>": 151644,
"<|image_pad|>": 151655,
"<|object_ref_end|>": 151647,
"<|object_ref_start|>": 151646,
"<|quad_end|>": 151651,
"<|quad_start|>": 151650,
"<|repo_name|>": 151663,
"<|video_pad|>": 151656,
"<|vision_end|>": 151653,
"<|vision_pad|>": 151654,
"<|vision_start|>": 151652
}

30
config.json Normal file
View File

@@ -0,0 +1,30 @@
{
"architectures": [
"Qwen3ForCausalLM"
],
"attention_bias": false,
"attention_dropout": 0.0,
"bos_token_id": 151643,
"eos_token_id": 151645,
"head_dim": 128,
"hidden_act": "silu",
"hidden_size": 4096,
"initializer_range": 0.02,
"intermediate_size": 12288,
"max_position_embeddings": 40960,
"max_window_layers": 36,
"model_type": "qwen3",
"num_attention_heads": 32,
"num_hidden_layers": 36,
"num_key_value_heads": 8,
"rms_norm_eps": 1e-06,
"rope_scaling": null,
"rope_theta": 1000000,
"sliding_window": null,
"tie_word_embeddings": false,
"torch_dtype": "bfloat16",
"transformers_version": "4.51.1",
"use_cache": true,
"use_sliding_window": false,
"vocab_size": 151744
}

13
generation_config.json Normal file
View File

@@ -0,0 +1,13 @@
{
"bos_token_id": 151643,
"do_sample": true,
"eos_token_id": [
151645,
151643
],
"pad_token_id": 151643,
"temperature": 0.6,
"top_k": 20,
"top_p": 0.95,
"transformers_version": "4.51.1"
}

151388
merges.txt Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c7518f9abec80fc05f085a7ca1624f27603d691322eeba2030c1a894912e54a3
size 4900684832

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:32cb63f1f06c832ec8a5df1f6c6c6a1ecc1e285bbed2690e39d1d9debb78aa39
size 4915960368

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ac906db1972bd385dfab4fe08ec3037fcc758ad6046c5196122405b0f6e0fd92
size 4983068496

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4fc7d83d34247e2ecbb46c070115e6e4356a21365fbe400356a5dbec1b816432
size 1578657400

View File

@@ -0,0 +1,406 @@
{
"metadata": {
"total_size": 16378324992
},
"weight_map": {
"lm_head.weight": "model-00004-of-00004.safetensors",
"model.embed_tokens.weight": "model-00001-of-00004.safetensors",
"model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
"model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
"model.layers.0.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
"model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.0.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
"model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
"model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
"model.layers.1.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
"model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.1.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
"model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
"model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
"model.layers.10.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
"model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.10.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
"model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
"model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
"model.layers.11.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
"model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.11.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
"model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
"model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
"model.layers.12.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
"model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.12.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
"model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
"model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
"model.layers.13.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
"model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.13.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
"model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
"model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
"model.layers.14.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
"model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.14.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
"model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
"model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
"model.layers.15.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
"model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.15.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
"model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
"model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
"model.layers.16.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
"model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.16.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
"model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
"model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
"model.layers.17.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
"model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.17.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
"model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors",
"model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
"model.layers.18.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
"model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.18.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
"model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors",
"model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
"model.layers.19.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
"model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.19.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
"model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
"model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
"model.layers.2.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
"model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.2.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
"model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.20.input_layernorm.weight": "model-00002-of-00004.safetensors",
"model.layers.20.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.20.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.20.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
"model.layers.20.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
"model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.20.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
"model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.21.input_layernorm.weight": "model-00002-of-00004.safetensors",
"model.layers.21.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.21.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.21.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.21.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
"model.layers.21.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
"model.layers.21.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.21.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.21.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
"model.layers.21.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.21.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
"model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
"model.layers.22.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
"model.layers.22.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.22.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.22.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
"model.layers.22.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.22.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
"model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
"model.layers.23.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
"model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.23.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
"model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
"model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
"model.layers.24.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
"model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.24.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
"model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
"model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
"model.layers.25.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
"model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.25.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
"model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
"model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
"model.layers.26.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
"model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.26.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
"model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
"model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
"model.layers.27.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
"model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.27.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
"model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors",
"model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
"model.layers.28.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
"model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.28.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
"model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors",
"model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
"model.layers.29.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
"model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.29.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
"model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
"model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
"model.layers.3.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
"model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.3.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
"model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors",
"model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
"model.layers.30.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
"model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.30.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
"model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.31.input_layernorm.weight": "model-00003-of-00004.safetensors",
"model.layers.31.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.31.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
"model.layers.31.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
"model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.31.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
"model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.32.input_layernorm.weight": "model-00003-of-00004.safetensors",
"model.layers.32.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.32.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.32.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.32.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
"model.layers.32.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
"model.layers.32.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.32.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.32.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
"model.layers.32.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.32.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.33.input_layernorm.weight": "model-00003-of-00004.safetensors",
"model.layers.33.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.33.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.33.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.33.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
"model.layers.33.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
"model.layers.33.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.33.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.33.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
"model.layers.33.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.33.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.34.input_layernorm.weight": "model-00003-of-00004.safetensors",
"model.layers.34.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.34.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.34.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.34.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
"model.layers.34.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
"model.layers.34.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.34.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.34.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
"model.layers.34.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.34.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.35.input_layernorm.weight": "model-00004-of-00004.safetensors",
"model.layers.35.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
"model.layers.35.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
"model.layers.35.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
"model.layers.35.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
"model.layers.35.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
"model.layers.35.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.35.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
"model.layers.35.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
"model.layers.35.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.35.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
"model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
"model.layers.4.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
"model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.4.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
"model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
"model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
"model.layers.5.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
"model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.5.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
"model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
"model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
"model.layers.6.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
"model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.6.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
"model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
"model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
"model.layers.7.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
"model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.7.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
"model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors",
"model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
"model.layers.8.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
"model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.8.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
"model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
"model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.9.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
"model.layers.9.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
"model.layers.9.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.9.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.9.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
"model.layers.9.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.9.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
"model.norm.weight": "model-00004-of-00004.safetensors"
}
}

594
slurm/9166912.0.err Normal file

File diff suppressed because one or more lines are too long

41654
slurm/9166912.0.out Normal file

File diff suppressed because it is too large Load Diff

597
slurm/9168614.0.err Normal file

File diff suppressed because one or more lines are too long

630
slurm/9168614.0.out Normal file
View File

@@ -0,0 +1,630 @@
SLURM_JOB_ID = 9168614
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
NODES = batch-block1-3273 batch-block1-3339 batch-block1-3736 batch-block1-3544 batch-block1-1089 batch-block1-3578 batch-block1-1062 batch-block1-3035
NODE_RANK = 3
GPUS_PER_NODE = 8
MASTER_ADDR = batch-block1-3273
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
SLURM_JOB_ID = 9168614
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
SLURM_JOB_ID = 9168614
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
SLURM_JOB_ID = 9168614
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
SLURM_JOB_ID = 9168614
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
SLURM_JOB_ID = 9168614
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
SLURM_JOB_ID = 9168614
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
SLURM_JOB_ID = 9168614
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
NODES = batch-block1-3273 batch-block1-3339 batch-block1-3736 batch-block1-3544 batch-block1-1089 batch-block1-3578 batch-block1-1062 batch-block1-3035
NODE_RANK = 7
GPUS_PER_NODE = 8
NODES = batch-block1-3273 batch-block1-3339 batch-block1-3736 batch-block1-3544 batch-block1-1089 batch-block1-3578 batch-block1-1062 batch-block1-3035
NODE_RANK = 6
GPUS_PER_NODE = 8
NODES = batch-block1-3273 batch-block1-3339 batch-block1-3736 batch-block1-3544 batch-block1-1089 batch-block1-3578 batch-block1-1062 batch-block1-3035
NODE_RANK = 0
GPUS_PER_NODE = 8
NODES = batch-block1-3273 batch-block1-3339 batch-block1-3736 batch-block1-3544 batch-block1-1089 batch-block1-3578 batch-block1-1062 batch-block1-3035
NODE_RANK = 1
GPUS_PER_NODE = 8
NODES = batch-block1-3273 batch-block1-3339 batch-block1-3736 batch-block1-3544 batch-block1-1089 batch-block1-3578 batch-block1-1062 batch-block1-3035
NODE_RANK = 4
GPUS_PER_NODE = 8
NODES = batch-block1-3273 batch-block1-3339 batch-block1-3736 batch-block1-3544 batch-block1-1089 batch-block1-3578 batch-block1-1062 batch-block1-3035
NODE_RANK = 5
GPUS_PER_NODE = 8
MASTER_ADDR = batch-block1-3273
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
MASTER_ADDR = batch-block1-3273
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
NODES = batch-block1-3273 batch-block1-3339 batch-block1-3736 batch-block1-3544 batch-block1-1089 batch-block1-3578 batch-block1-1062 batch-block1-3035
NODE_RANK = 2
GPUS_PER_NODE = 8
MASTER_ADDR = batch-block1-3273
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
MASTER_ADDR = batch-block1-3273
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
MASTER_ADDR = batch-block1-3273
MASTER_ADDR = batch-block1-3273
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
MASTER_ADDR = batch-block1-3273
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1Imported prefix tree collator v1
Imported prefix tree collator v1Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1Imported prefix tree collator v1
Imported prefix tree collator v1Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
[2026-04-13 22:52:44,641] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:44,641] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:44,641] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:44,641] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:44,641] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:44,641] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:44,641] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:44,641] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:44,823] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:44,823] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:44,823] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:44,823] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:44,823] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:44,823] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:44,823] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:44,824] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:44,830] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:44,830] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:44,830] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:44,830] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:44,830] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:44,830] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:44,830] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:44,831] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:45,044] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:45,044] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:45,044] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:45,044] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:45,044] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:45,044] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:45,044] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:45,044] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:45,505] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:45,505] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:45,505] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:45,505] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:45,505] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:45,505] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:45,505] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:45,505] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:45,710] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:45,710] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:45,710] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:45,710] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:45,710] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:45,710] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:45,710] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:45,711] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:46,098] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:46,098] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:46,098] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:46,098] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:46,098] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:46,098] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:46,098] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:46,099] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:46,114] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:46,114] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:46,114] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:46,114] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:46,114] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:46,114] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:46,114] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:46,115] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:52:57,932] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:57,932] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:57,932] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:57,932] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:57,932] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:57,932] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:57,932] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:57,932] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:58,617] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:58,617] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:58,617] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:58,617] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:58,617] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:58,617] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:58,618] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:58,618] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:58,976] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:58,976] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:58,976] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:58,976] [INFO] [comm.py:706:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
[2026-04-13 22:52:58,976] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:58,977] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:58,977] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:58,976] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:58,977] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:58,977] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:58,977] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:58,977] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:58,977] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:58,977] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:58,978] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:58,978] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:58,978] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:59,398] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:59,398] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:59,398] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:59,399] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:59,399] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:59,399] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:59,399] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:59,399] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:59,650] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:59,650] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:59,650] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:59,650] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:59,650] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:59,650] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:59,650] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:59,650] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:52:59,893] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:52:59,894] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:52:59,924] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:52:59,924] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:52:59,924] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:52:59,988] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:52:59,988] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:52:59,988] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:52:59,988] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:52:59,988] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:52:59,988] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:52:59,988] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:52:59,988] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:52:59,988] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:52:59,988] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:52:59,988] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:52:59,989] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:00,251] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:00,305] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:00,315] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:00,318] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:00,318] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:00,320] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:00,320] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:00,320] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:00,321] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:00,332] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:00,333] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:00,333] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:00,337] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:00,338] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:00,338] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:00,342] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:00,383] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:53:00,383] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:53:00,383] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:53:00,383] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:53:00,383] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:53:00,383] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:53:00,383] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:53:00,384] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:53:00,471] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:53:00,471] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:53:00,471] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:53:00,471] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:53:00,472] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:53:00,472] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:53:00,472] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:53:00,472] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:53:02,594] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:02,604] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:02,604] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:02,627] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:02,628] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:02,634] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:02,634] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:02,645] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:02,672] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:02,672] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:02,672] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:02,672] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:02,675] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:02,680] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:02,692] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:02,693] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:05,737] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:05,750] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:05,779] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:05,813] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:05,854] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:05,858] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:05,885] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:05,899] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:05,899] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:05,917] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:05,932] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:05,949] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:05,959] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:05,959] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:05,961] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:53:10,377] [INFO] [partition_parameters.py:348:__exit__] finished initializing model - num_params = 399, num_elems = 8.19B
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3578Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3578
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3578
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3578
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3578
args.report_to: ['wandb']
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3578
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3578
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3578
Using Prefix Tree collatorUsing Prefix Tree collatorUsing Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']args.report_to: ['wandb']
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-1062args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-1062
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-1062
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-1062
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-1062
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-1062
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-1062
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-1062
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collatorUsing Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collatorUsing Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-3736
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-3736
args.report_to: ['wandb']args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-3736args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-3736
args.report_to: ['wandb']args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-3736args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-3736
args.report_to: ['wandb']
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-3736
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-3736
Using Prefix Tree collator
Using Prefix Tree collatorUsing Prefix Tree collator
Using Prefix Tree collatorUsing Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-1089
args.report_to: ['wandb']
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-1089
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-1089
args.report_to: ['wandb']args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-1089args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-1089
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-1089
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-1089
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-53-00_batch-block1-1089
Using Prefix Tree collator
Using Prefix Tree collatorUsing Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3273
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3273
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3273
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3273
Using Prefix Tree collatorargs.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3273
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3273
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3273
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3273
Using Prefix Tree collatorUsing Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']args.report_to: ['wandb']args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3339args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3339
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3339
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3339
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3339
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3339
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3339args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-59_batch-block1-3339
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collatorUsing Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3035
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3035
args.report_to: ['wandb']
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3035
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3035
Using Prefix Tree collator
args.report_to: ['wandb']Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3035
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3035
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3035
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-58_batch-block1-3035
Using Prefix Tree collator
Using Prefix Tree collatorUsing Prefix Tree collatorUsing Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-57_batch-block1-3544
args.report_to: ['wandb']args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-57_batch-block1-3544args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-57_batch-block1-3544
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-57_batch-block1-3544
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-57_batch-block1-3544
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-57_batch-block1-3544
args.report_to: ['wandb']args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-57_batch-block1-3544args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-52-57_batch-block1-3544
Parameter Offload: Total persistent parameters: 308224 in 145 params
{'train_runtime': 4.1111, 'train_samples_per_second': 1875.899, 'train_steps_per_second': 15.568, 'train_loss': 0.0, 'epoch': 8.0}
wandb:
wandb: 🚀 View run runs/dev/tw-8node-resume at: https://wandb.ai/ligeng-zhu/ThreadWeaver/runs/tw-8node-resume

595
slurm/9168619.0.err Normal file

File diff suppressed because one or more lines are too long

630
slurm/9168619.0.out Normal file
View File

@@ -0,0 +1,630 @@
SLURM_JOB_ID = 9168619
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
SLURM_JOB_ID = 9168619
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
SLURM_JOB_ID = 9168619
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
SLURM_JOB_ID = 9168619
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
SLURM_JOB_ID = 9168619
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
SLURM_JOB_ID = 9168619
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
SLURM_JOB_ID = 9168619
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
NODES = batch-block1-3273 batch-block1-3339 batch-block1-3736 batch-block1-3544 batch-block1-1089 batch-block1-3578 batch-block1-1062 batch-block1-3035
NODE_RANK = 3
GPUS_PER_NODE = 8
SLURM_JOB_ID = 9168619
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
NODES = batch-block1-3273 batch-block1-3339 batch-block1-3736 batch-block1-3544 batch-block1-1089 batch-block1-3578 batch-block1-1062 batch-block1-3035
NODE_RANK = 7
GPUS_PER_NODE = 8
NODES = batch-block1-3273 batch-block1-3339 batch-block1-3736 batch-block1-3544 batch-block1-1089 batch-block1-3578 batch-block1-1062 batch-block1-3035
NODE_RANK = 0
GPUS_PER_NODE = 8
NODES = batch-block1-3273 batch-block1-3339 batch-block1-3736 batch-block1-3544 batch-block1-1089 batch-block1-3578 batch-block1-1062 batch-block1-3035
NODE_RANK = 6
GPUS_PER_NODE = 8
NODES = batch-block1-3273 batch-block1-3339 batch-block1-3736 batch-block1-3544 batch-block1-1089 batch-block1-3578 batch-block1-1062 batch-block1-3035
NODE_RANK = 4
GPUS_PER_NODE = 8
NODES = batch-block1-3273 batch-block1-3339 batch-block1-3736 batch-block1-3544 batch-block1-1089 batch-block1-3578 batch-block1-1062 batch-block1-3035
NODE_RANK = 5
GPUS_PER_NODE = 8
NODES = batch-block1-3273 batch-block1-3339 batch-block1-3736 batch-block1-3544 batch-block1-1089 batch-block1-3578 batch-block1-1062 batch-block1-3035
NODE_RANK = 1
GPUS_PER_NODE = 8
MASTER_ADDR = batch-block1-3273
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
NODES = batch-block1-3273 batch-block1-3339 batch-block1-3736 batch-block1-3544 batch-block1-1089 batch-block1-3578 batch-block1-1062 batch-block1-3035
NODE_RANK = 2
GPUS_PER_NODE = 8
MASTER_ADDR = batch-block1-3273
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
MASTER_ADDR = batch-block1-3273
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
MASTER_ADDR = batch-block1-3273
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
MASTER_ADDR = batch-block1-3273
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
MASTER_ADDR = batch-block1-3273
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
MASTER_ADDR = batch-block1-3273
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
MASTER_ADDR = batch-block1-3273
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
[2026-04-13 22:55:05,743] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:05,789] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:05,789] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:05,789] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:05,790] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:05,790] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:05,791] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:05,810] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:05,826] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:05,827] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:05,834] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:05,834] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:05,856] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:05,856] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:05,858] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:05,860] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:05,867] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:05,875] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:05,882] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:05,882] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:05,883] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:05,887] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:05,888] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:05,890] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:05,892] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:05,894] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:05,896] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:05,898] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:05,899] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:05,903] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:05,906] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:05,910] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:06,046] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:06,077] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:06,102] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:06,128] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:06,130] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:06,131] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:06,192] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:06,192] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:06,192] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:06,199] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:06,212] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:06,224] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:06,225] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:06,228] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:06,229] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:06,237] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:06,259] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:06,271] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:06,290] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:06,376] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:06,379] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:06,383] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:06,385] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:06,385] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:06,641] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:06,666] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:06,714] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:06,736] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:06,769] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:06,786] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:06,791] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:06,799] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:55:14,032] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:14,042] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:14,434] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:14,435] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:14,438] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:14,446] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:14,532] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:14,533] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:14,538] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:14,538] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:14,541] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:14,600] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:14,603] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:14,615] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:14,621] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:14,648] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:14,665] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:14,682] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:14,696] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:14,718] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:14,760] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:14,821] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:14,830] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:14,867] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:14,875] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:14,879] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:14,883] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:14,888] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:14,894] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:14,910] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:14,924] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:14,924] [INFO] [comm.py:706:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
[2026-04-13 22:55:14,943] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:14,964] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:14,989] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:15,005] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:15,049] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:15,050] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:15,101] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:15,107] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:15,145] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:15,188] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:15,201] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:15,203] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:15,217] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:15,249] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:15,271] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:15,295] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:15,315] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:15,354] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:15,383] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:15,416] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:15,419] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:15,441] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:15,450] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:15,471] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:15,491] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:15,499] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:15,518] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:15,524] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:15,546] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:15,577] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:15,579] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:15,585] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:15,588] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:55:17,410] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:17,419] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:17,424] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:17,440] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:17,454] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:17,459] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:17,491] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:17,494] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:17,498] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:17,542] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:17,555] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:17,752] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:17,754] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:17,754] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:17,757] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:17,762] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:17,762] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:21,637] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:21,748] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:21,866] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:21,867] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:21,870] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:21,876] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:21,885] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:21,900] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:21,904] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:21,907] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:21,908] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:21,908] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:21,912] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:21,921] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:22,076] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:22,108] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:22,295] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:26,257] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:26,288] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:26,302] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:26,302] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:26,325] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:26,337] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:26,341] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:26,341] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:26,346] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:26,358] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:26,370] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:26,371] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:26,374] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:26,376] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:26,389] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:26,391] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:26,392] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:26,395] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:26,395] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:26,405] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:26,409] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:26,420] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:26,454] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:26,457] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:26,468] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:26,468] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:26,485] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:26,486] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:27,300] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:27,321] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:55:31,232] [INFO] [partition_parameters.py:348:__exit__] finished initializing model - num_params = 399, num_elems = 8.19B
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-1062
Using Prefix Tree collator
args.report_to: ['wandb']
Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3578
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-1062
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-1089
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3339
Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-1062
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
Using Prefix Tree collator
Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3035
Using Prefix Tree collator
args.report_to: ['wandb']
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-3736
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-3544
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-1062
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-3736
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-1089
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-3544
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-3544
args.report_to: ['wandb']
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-3544
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-1089
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3035
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-3544
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-1089
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-1062
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
Using Prefix Tree collator
args.report_to: ['wandb']
Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3035
Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3339
Using Prefix Tree collator
args.report_to: ['wandb']
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3273
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3273
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3578
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3273
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3339
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-1089
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3339
args.report_to: ['wandb']
Using Prefix Tree collator
Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3035
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-1062
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-3736
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3578
args.report_to: ['wandb']
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-1062
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-3736
Using Prefix Tree collator
args.report_to: ['wandb']
Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3035
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-1089
Using Prefix Tree collator
args.report_to: ['wandb']
args.report_to: ['wandb']
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3578
Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-1062
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3339
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3578
args.report_to: ['wandb']
Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3035
args.report_to: ['wandb']
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-3736
Using Prefix Tree collator
args.report_to: ['wandb']
args.report_to: ['wandb']
args.report_to: ['wandb']
Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3339
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3273
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-1089
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-1089
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3544args.report_to: ['wandb']
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-3544
Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3578
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-3736
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-3544
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-3578
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-15_batch-block1-3736
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3339
args.report_to: ['wandb']
Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3273
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3578
args.report_to: ['wandb']
Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3736
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3273
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3273
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3339
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3273
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3035
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-55-14_batch-block1-3035
Parameter Offload: Total persistent parameters: 308224 in 145 params
{'train_runtime': 3.3092, 'train_samples_per_second': 2330.489, 'train_steps_per_second': 19.34, 'train_loss': 0.0, 'epoch': 8.0}
wandb:
wandb: 🚀 View run runs/dev/tw-8node-resume at: https://wandb.ai/ligeng-zhu/ThreadWeaver/runs/tw-8node-resume

594
slurm/9168624.0.err Normal file

File diff suppressed because one or more lines are too long

630
slurm/9168624.0.out Normal file
View File

@@ -0,0 +1,630 @@
SLURM_JOB_ID = 9168624
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
SLURM_JOB_ID = 9168624
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
NODES = batch-block1-3736 batch-block1-3305 batch-block1-3544 batch-block1-1089 batch-block1-3163 batch-block1-3038 batch-block1-1062 batch-block1-3035
NODE_RANK = 4
GPUS_PER_NODE = 8
NODES = batch-block1-3736 batch-block1-3305 batch-block1-3544 batch-block1-1089 batch-block1-3163 batch-block1-3038 batch-block1-1062 batch-block1-3035
NODE_RANK = 5
GPUS_PER_NODE = 8
MASTER_ADDR = batch-block1-3736
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
MASTER_ADDR = batch-block1-3736
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
SLURM_JOB_ID = 9168624
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
SLURM_JOB_ID = 9168624
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
SLURM_JOB_ID = 9168624
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
SLURM_JOB_ID = 9168624
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
NODES = batch-block1-3736 batch-block1-3305 batch-block1-3544 batch-block1-1089 batch-block1-3163 batch-block1-3038 batch-block1-1062 batch-block1-3035
NODE_RANK = 7
GPUS_PER_NODE = 8
NODES = batch-block1-3736 batch-block1-3305 batch-block1-3544 batch-block1-1089 batch-block1-3163 batch-block1-3038 batch-block1-1062 batch-block1-3035
NODE_RANK = 3
GPUS_PER_NODE = 8
SLURM_JOB_ID = 9168624
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
NODES = batch-block1-3736 batch-block1-3305 batch-block1-3544 batch-block1-1089 batch-block1-3163 batch-block1-3038 batch-block1-1062 batch-block1-3035
NODE_RANK = 1
GPUS_PER_NODE = 8
SLURM_JOB_ID = 9168624
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
MASTER_ADDR = batch-block1-3736
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
NODES = batch-block1-3736 batch-block1-3305 batch-block1-3544 batch-block1-1089 batch-block1-3163 batch-block1-3038 batch-block1-1062 batch-block1-3035
NODE_RANK = 2
GPUS_PER_NODE = 8
MASTER_ADDR = batch-block1-3736
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
NODES = batch-block1-3736 batch-block1-3305 batch-block1-3544 batch-block1-1089 batch-block1-3163 batch-block1-3038 batch-block1-1062 batch-block1-3035
NODE_RANK = 0
GPUS_PER_NODE = 8
MASTER_ADDR = batch-block1-3736
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
NODES = batch-block1-3736 batch-block1-3305 batch-block1-3544 batch-block1-1089 batch-block1-3163 batch-block1-3038 batch-block1-1062 batch-block1-3035
NODE_RANK = 6
GPUS_PER_NODE = 8
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
MASTER_ADDR = batch-block1-3736
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
MASTER_ADDR = batch-block1-3736
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
MASTER_ADDR = batch-block1-3736
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1Imported prefix tree collator v1Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1Imported prefix tree collator v1Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
[2026-04-13 22:57:05,613] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:05,613] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:05,675] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:05,705] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:05,869] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:05,871] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:05,874] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:05,876] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:05,882] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:05,885] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:05,902] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:05,962] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:05,967] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:05,997] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:06,005] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:06,008] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:06,011] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:06,016] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:06,041] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:06,048] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:06,055] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:06,076] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:06,078] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:06,108] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:06,119] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:06,122] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:06,280] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:06,346] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:06,401] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:06,418] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:06,442] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:06,456] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:06,457] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:06,461] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:06,468] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:06,480] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:06,504] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:06,506] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:06,510] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:06,519] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:13,815] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:13,817] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:14,049] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:14,097] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:14,123] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:14,212] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:14,298] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:14,326] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:14,331] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:14,339] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:14,424] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:14,502] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:14,508] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:14,571] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:14,570] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:14,576] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:14,581] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:14,593] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:14,603] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:14,603] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:14,635] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:14,657] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:14,659] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:14,667] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:14,690] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:14,754] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:14,768] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:14,800] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:14,820] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:14,894] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:15,006] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:15,054] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:15,358] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:15,377] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:15,407] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:15,465] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:15,467] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:15,467] [INFO] [comm.py:706:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
[2026-04-13 22:57:15,484] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:15,485] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:15,501] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:15,805] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:15,845] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:15,845] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:15,854] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:15,906] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:15,910] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:15,950] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:15,955] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:15,979] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:15,979] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:15,981] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:15,981] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:15,986] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:15,989] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:15,992] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:16,003] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:16,004] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:16,009] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:16,023] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:16,044] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:16,050] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:16,054] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:22,654] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:22,662] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:22,662] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:22,677] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:22,680] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:22,683] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:22,684] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:22,684] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:22,690] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:22,694] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:22,710] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:22,750] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:22,751] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:22,753] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:22,757] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:22,769] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:22,775] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:25,930] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:25,930] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:25,930] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:25,930] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:25,930] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:25,930] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:25,930] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:25,930] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:26,065] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:26,065] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:26,065] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:26,065] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:26,065] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:26,065] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:26,065] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:26,065] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:26,534] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:26,534] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:26,534] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:26,534] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:26,534] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:26,534] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:26,534] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:26,535] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:57:26,700] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:37,768] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:37,768] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:37,768] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:37,768] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:37,768] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:37,768] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:37,768] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:37,769] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:37,796] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:37,796] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:37,796] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:37,796] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:37,796] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:37,796] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:37,796] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:37,796] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:38,359] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:38,359] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:38,359] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:38,359] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:38,359] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:38,359] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:38,359] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:38,360] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:57:40,194] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:40,194] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:40,194] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:40,194] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:40,194] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:40,215] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:40,215] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:40,215] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:40,215] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:40,215] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:40,215] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:40,216] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:40,363] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:40,363] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:40,363] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:40,363] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:40,364] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:47,001] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:47,049] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:47,061] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:47,091] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:47,098] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:47,098] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:47,105] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:57:50,876] [INFO] [partition_parameters.py:348:__exit__] finished initializing model - num_params = 399, num_elems = 8.19B
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-1089
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-3035
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
args.report_to: ['wandb']
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-1062
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-1089
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-13_batch-block1-3544
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-1062
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-15_batch-block1-3736
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-3544
Using Prefix Tree collator
args.report_to: ['wandb']
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-15_batch-block1-3736
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-13_batch-block1-3544
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-15_batch-block1-3736
args.report_to: ['wandb']
Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-3035
args.report_to: ['wandb']
Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-15_batch-block1-3736
args.report_to: ['wandb']
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-1062
Using Prefix Tree collator
args.report_to: ['wandb']
Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-1089
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-1089
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-1089
Using Prefix Tree collator
args.report_to: ['wandb']
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-1062
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-3544
args.report_to: ['wandb']
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-3544
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-1062
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-15_batch-block1-1062
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-15_batch-block1-3736
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-3035
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-1089
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
args.report_to: ['wandb']
Using Prefix Tree collator
Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-1062
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-3544
Using Prefix Tree collator
args.report_to: ['wandb']Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-3035
Using Prefix Tree collator
args.report_to: ['wandb']
args.report_to: ['wandb']
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-3035
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-3035
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-1089
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-15_batch-block1-3736
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-1089
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-3544
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-3544
args.report_to: ['wandb']
Using Prefix Tree collatorUsing Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-15_batch-block1-1062
args.report_to: ['wandb']
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-3035
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-14_batch-block1-3035
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-15_batch-block1-3736
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-15_batch-block1-3736
Using Prefix Tree collator
Using Prefix Tree collatorUsing Prefix Tree collator
Using Prefix Tree collatorUsing Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-37_batch-block1-3163
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-37_batch-block1-3163
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-37_batch-block1-3163
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-37_batch-block1-3163args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-37_batch-block1-3163
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-37_batch-block1-3163
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-37_batch-block1-3163
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-37_batch-block1-3163
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collatorUsing Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-38_batch-block1-3305
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-38_batch-block1-3305
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-38_batch-block1-3305args.report_to: ['wandb']
args.report_to: ['wandb']
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-38_batch-block1-3305
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-38_batch-block1-3305
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-38_batch-block1-3305
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-38_batch-block1-3305
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-38_batch-block1-3305
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collatorUsing Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-37_batch-block1-3038
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-37_batch-block1-3038
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-37_batch-block1-3038
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-37_batch-block1-3038
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-37_batch-block1-3038
args.report_to: ['wandb']args.report_to: ['wandb']
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-37_batch-block1-3038
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-37_batch-block1-3038
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-57-37_batch-block1-3038
Parameter Offload: Total persistent parameters: 308224 in 145 params
{'train_runtime': 4.1282, 'train_samples_per_second': 1868.144, 'train_steps_per_second': 15.503, 'train_loss': 0.0, 'epoch': 8.0}
wandb:
wandb: 🚀 View run runs/dev/tw-8node-resume at: https://wandb.ai/ligeng-zhu/ThreadWeaver/runs/tw-8node-resume

594
slurm/9168628.0.err Normal file

File diff suppressed because one or more lines are too long

630
slurm/9168628.0.out Normal file
View File

@@ -0,0 +1,630 @@
SLURM_JOB_ID = 9168628
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
NODES = batch-block1-3061 batch-block1-3736 batch-block1-3305 batch-block1-3544 batch-block1-1089 batch-block1-3163 batch-block1-3038 batch-block1-3035
NODE_RANK = 0
GPUS_PER_NODE = 8
MASTER_ADDR = batch-block1-3061
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
SLURM_JOB_ID = 9168628
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
SLURM_JOB_ID = 9168628
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
SLURM_JOB_ID = 9168628
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
SLURM_JOB_ID = 9168628
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
SLURM_JOB_ID = 9168628
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
NODES = batch-block1-3061 batch-block1-3736 batch-block1-3305 batch-block1-3544 batch-block1-1089 batch-block1-3163 batch-block1-3038 batch-block1-3035
NODE_RANK = 7
GPUS_PER_NODE = 8
NODES = batch-block1-3061 batch-block1-3736 batch-block1-3305 batch-block1-3544 batch-block1-1089 batch-block1-3163 batch-block1-3038 batch-block1-3035
SLURM_JOB_ID = 9168628
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
NODE_RANK = 2
GPUS_PER_NODE = 8
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
NODES = batch-block1-3061 batch-block1-3736 batch-block1-3305 batch-block1-3544 batch-block1-1089 batch-block1-3163 batch-block1-3038 batch-block1-3035
NODE_RANK = 6
GPUS_PER_NODE = 8
NODES = batch-block1-3061 batch-block1-3736 batch-block1-3305 batch-block1-3544 batch-block1-1089 batch-block1-3163 batch-block1-3038 batch-block1-3035
NODE_RANK = 3
GPUS_PER_NODE = 8
NODES = batch-block1-3061 batch-block1-3736 batch-block1-3305 batch-block1-3544 batch-block1-1089 batch-block1-3163 batch-block1-3038 batch-block1-3035
NODE_RANK = 4
GPUS_PER_NODE = 8
MASTER_ADDR = batch-block1-3061
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
MASTER_ADDR = batch-block1-3061
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
MASTER_ADDR = batch-block1-3061
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
NODES = batch-block1-3061 batch-block1-3736 batch-block1-3305 batch-block1-3544 batch-block1-1089 batch-block1-3163 batch-block1-3038 batch-block1-3035
NODE_RANK = 5
GPUS_PER_NODE = 8
MASTER_ADDR = batch-block1-3061
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
MASTER_ADDR = batch-block1-3061
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
MASTER_ADDR = batch-block1-3061
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
SLURM_JOB_ID = 9168628
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
NODES = batch-block1-3061 batch-block1-3736 batch-block1-3305 batch-block1-3544 batch-block1-1089 batch-block1-3163 batch-block1-3038 batch-block1-3035
NODE_RANK = 1
GPUS_PER_NODE = 8
MASTER_ADDR = batch-block1-3061
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1Imported prefix tree collator v1
Imported prefix tree collator v1Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
[2026-04-13 22:59:31,354] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:31,508] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:31,522] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:31,534] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:31,537] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:31,571] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:31,609] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:31,624] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:31,732] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:31,735] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:31,750] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:31,765] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:31,766] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:31,812] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:31,824] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:31,830] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:31,843] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:31,848] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:31,851] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:31,858] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:31,863] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:31,870] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:31,870] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:31,880] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:31,882] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:31,886] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:31,888] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:31,890] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:31,892] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:31,894] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:31,894] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:31,896] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:31,900] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:31,904] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:31,954] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:31,956] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:31,962] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:31,968] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:31,970] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:31,978] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:32,119] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:32,126] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:32,136] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:32,172] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:32,178] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:32,183] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:32,197] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:32,205] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:32,207] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:32,210] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:32,211] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:32,295] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:32,309] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:32,309] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:32,311] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:32,314] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:39,924] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:59:40,048] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:59:40,160] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:59:40,170] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:59:40,319] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:59:40,344] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:59:40,387] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:59:40,400] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:59:40,426] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:59:40,432] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:59:40,447] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:59:40,478] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:59:40,482] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:59:40,490] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:59:40,508] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:59:40,545] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:59:40,551] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:59:40,569] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:59:40,571] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:59:40,577] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:59:40,638] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:59:40,638] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:59:40,657] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:59:40,666] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:59:40,671] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:59:40,677] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:59:40,681] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:59:40,682] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:59:40,715] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:59:40,751] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:59:40,753] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:59:40,756] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:59:40,778] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:59:40,812] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:59:40,826] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:59:40,888] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:59:40,912] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:59:40,934] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:59:40,938] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:59:40,941] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:59:40,952] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:59:40,984] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:59:40,989] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:59:41,068] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:59:41,107] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:59:41,124] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:59:41,126] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:59:41,129] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:59:41,133] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:59:41,138] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:59:41,345] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:59:41,463] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:59:41,478] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:59:41,507] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:59:41,508] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:59:41,517] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 22:59:42,784] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:59:42,797] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:59:42,811] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:59:42,813] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:59:42,824] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:59:42,827] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:59:42,828] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:59:42,842] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:59:42,883] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:59:42,935] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:59:42,947] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:59:42,949] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:59:42,954] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:59:42,962] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:59:42,962] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:59:42,968] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:59:42,971] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:59:47,499] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:59:47,501] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:59:47,501] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:59:47,540] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:59:47,553] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:59:47,561] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:59:47,562] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:59:47,587] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:59:47,590] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:59:47,594] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:59:47,595] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:59:47,597] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:59:47,599] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:59:47,610] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:59:47,612] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:59:47,613] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:59:47,696] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:59:52,018] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:52,018] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:52,018] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:52,018] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:52,018] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:52,018] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:52,018] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:52,018] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 22:59:52,424] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:59:52,454] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:59:52,470] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:59:52,483] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:59:52,483] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:59:52,486] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:59:52,504] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:59:52,505] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:59:52,509] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:59:52,510] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:59:52,513] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:59:52,538] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:59:52,540] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:59:52,541] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:59:52,541] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:59:52,542] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:59:52,545] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:59:52,550] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:59:52,551] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:59:52,570] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:59:52,572] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 22:59:52,575] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:00:03,732] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:00:03,732] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:00:03,732] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:00:03,732] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:00:03,732] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:00:03,732] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:00:03,732] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:00:03,732] [INFO] [comm.py:706:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
[2026-04-13 23:00:03,732] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:00:05,419] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:00:05,419] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:00:05,419] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:00:05,419] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:00:05,419] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:00:05,419] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:00:05,419] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:00:05,419] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:00:10,135] [INFO] [partition_parameters.py:348:__exit__] finished initializing model - num_params = 399, num_elems = 8.19B
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-41_batch-block1-3736
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-1089
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-1089
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3038
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-39_batch-block1-3544
Using Prefix Tree collator
args.report_to: ['wandb']
Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3544
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3305
args.report_to: ['wandb']
Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3038
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3035
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3163
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3038
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3038
args.report_to: ['wandb']
args.report_to: ['wandb']
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3163
args.report_to: ['wandb']
Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3038
Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-41_batch-block1-3736
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3035
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3038
Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-1089
Using Prefix Tree collator
args.report_to: ['wandb']
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-41_batch-block1-3305
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3544
args.report_to: ['wandb']
Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3163
args.report_to: ['wandb']
Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3305
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-41_batch-block1-3305
Using Prefix Tree collator
args.report_to: ['wandb']
Using Prefix Tree collator
Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-41_batch-block1-3736
args.report_to: ['wandb']
args.report_to: ['wandb']
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-1089
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3035
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3038
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3038
args.report_to: ['wandb']
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-41_batch-block1-3736
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3544
Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3163
args.report_to: ['wandb']
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-1089
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-41_batch-block1-3736
args.report_to: ['wandb']
Using Prefix Tree collatorUsing Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-1089
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
Using Prefix Tree collatorUsing Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3163
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3163
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3163
Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3544
args.report_to: ['wandb']
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3544
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3544
args.report_to: ['wandb']
args.report_to: ['wandb']
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-41_batch-block1-3736
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3035
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3163
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-41_batch-block1-3736
args.report_to: ['wandb']
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3035
Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-41_batch-block1-1089
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-41_batch-block1-3736
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3544
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3305
Using Prefix Tree collatorUsing Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-41_batch-block1-3305
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-41_batch-block1-3305
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3035
args.report_to: ['wandb']
args.report_to: ['wandb']
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3035
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-1089
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3305
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_22-59-40_batch-block1-3035
Using Prefix Tree collatorUsing Prefix Tree collatorUsing Prefix Tree collatorUsing Prefix Tree collator
Using Prefix Tree collatorUsing Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']args.report_to: ['wandb']args.report_to: ['wandb']
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-00-03_batch-block1-3061args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-00-03_batch-block1-3061args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-00-03_batch-block1-3061
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-00-03_batch-block1-3061
args.report_to: ['wandb']args.report_to: ['wandb']
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-00-03_batch-block1-3061args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-00-03_batch-block1-3061
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-00-03_batch-block1-3061
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-00-03_batch-block1-3061
Parameter Offload: Total persistent parameters: 308224 in 145 params
{'train_runtime': 3.8014, 'train_samples_per_second': 2028.721, 'train_steps_per_second': 16.836, 'train_loss': 0.0, 'epoch': 8.0}
wandb:
wandb: 🚀 View run runs/dev/tw-8node-resume at: https://wandb.ai/ligeng-zhu/ThreadWeaver/runs/tw-8node-resume

594
slurm/9168633.0.err Normal file

File diff suppressed because one or more lines are too long

630
slurm/9168633.0.out Normal file
View File

@@ -0,0 +1,630 @@
SLURM_JOB_ID = 9168633
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
NODES = batch-block1-1015 batch-block1-3297 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3908 batch-block1-3227
NODE_RANK = 5
GPUS_PER_NODE = 8
SLURM_JOB_ID = 9168633
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
MASTER_ADDR = batch-block1-1015
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
SLURM_JOB_ID = 9168633
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
SLURM_JOB_ID = 9168633
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
NODES = batch-block1-1015 batch-block1-3297 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3908 batch-block1-3227
NODE_RANK = 3
GPUS_PER_NODE = 8
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
NODES = batch-block1-1015 batch-block1-3297 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3908 batch-block1-3227
NODE_RANK = 7
GPUS_PER_NODE = 8
NODES = batch-block1-1015 batch-block1-3297 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3908 batch-block1-3227
NODE_RANK = 6
GPUS_PER_NODE = 8
MASTER_ADDR = batch-block1-1015
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
MASTER_ADDR = batch-block1-1015
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
MASTER_ADDR = batch-block1-1015
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
SLURM_JOB_ID = 9168633
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
NODES = batch-block1-1015 batch-block1-3297 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3908 batch-block1-3227
NODE_RANK = 2
GPUS_PER_NODE = 8
SLURM_JOB_ID = 9168633
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
MASTER_ADDR = batch-block1-1015
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
NODES = batch-block1-1015 batch-block1-3297 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3908 batch-block1-3227
NODE_RANK = 1
GPUS_PER_NODE = 8
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
SLURM_JOB_ID = 9168633
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
MASTER_ADDR = batch-block1-1015
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
NODES = batch-block1-1015 batch-block1-3297 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3908 batch-block1-3227
NODE_RANK = 0
GPUS_PER_NODE = 8
MASTER_ADDR = batch-block1-1015
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
SLURM_JOB_ID = 9168633
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
NODES = batch-block1-1015 batch-block1-3297 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3908 batch-block1-3227
NODE_RANK = 4
GPUS_PER_NODE = 8
MASTER_ADDR = batch-block1-1015
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1Imported prefix tree collator v1
Imported prefix tree collator v1Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
[2026-04-13 23:03:58,482] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:03:58,482] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:03:58,609] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:03:58,667] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:03:58,672] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:03:58,672] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:03:58,672] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:03:58,672] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:04:06,850] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:06,866] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:07,134] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:07,203] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:07,325] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:07,329] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:07,336] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:07,365] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:08,644] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:08,757] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:08,759] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:08,761] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:08,761] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:08,766] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:08,769] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:08,769] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:17,775] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:04:17,775] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:04:17,775] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:04:17,775] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:04:17,775] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:04:17,775] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:04:17,775] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:04:17,775] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:04:17,775] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:04:17,775] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:04:17,775] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:04:17,775] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:04:17,775] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:04:17,775] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:04:17,775] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:04:17,776] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:04:18,422] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:04:18,422] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:04:18,422] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:04:18,422] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:04:18,422] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:04:18,422] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:04:18,422] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:04:18,423] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:04:18,531] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:04:18,531] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:04:18,531] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:04:18,531] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:04:18,531] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:04:18,531] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:04:18,531] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:04:18,531] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:04:18,582] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:04:18,582] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:04:18,582] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:04:18,582] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:04:18,582] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:04:18,582] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:04:18,582] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:04:18,582] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:04:18,773] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:04:18,773] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:04:18,773] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:04:18,773] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:04:18,773] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:04:18,773] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:04:18,773] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:04:18,774] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:04:19,192] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:04:19,192] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:04:19,192] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:04:19,192] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:04:19,192] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:04:19,192] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:04:19,192] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:04:19,193] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:04:29,337] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:29,337] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:29,337] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:29,337] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:29,337] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:29,337] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:29,337] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:29,337] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:29,376] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:29,376] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:29,376] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:29,376] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:29,376] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:29,376] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:29,376] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:29,376] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:30,218] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:30,218] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:30,218] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:30,218] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:30,218] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:30,218] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:30,218] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:30,218] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:30,223] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:30,223] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:30,223] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:30,223] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:30,223] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:30,224] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:30,224] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:30,224] [INFO] [comm.py:706:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
[2026-04-13 23:04:30,224] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:30,348] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:30,348] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:30,348] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:30,348] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:30,349] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:30,349] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:30,349] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:30,349] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:30,351] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:30,351] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:30,351] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:30,351] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:30,351] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:30,351] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:30,351] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:30,351] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:31,121] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:31,122] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:31,122] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:31,172] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:31,172] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:31,172] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:31,172] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:31,172] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:31,210] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:31,224] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:31,224] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:31,224] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:31,224] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:31,224] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:31,224] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:31,224] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:31,224] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:04:31,227] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:31,246] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:31,247] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:31,358] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:31,358] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:31,358] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:31,359] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:31,893] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:31,893] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:31,893] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:31,893] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:31,893] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:31,894] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:31,894] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:31,895] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:36,779] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:36,822] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:36,822] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:36,850] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:36,862] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:36,874] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:36,896] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:36,900] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:36,906] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:36,908] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:36,911] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:36,914] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:36,923] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:36,932] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:36,932] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:36,953] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:36,955] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:36,961] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:36,969] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:36,969] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:36,969] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:36,974] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:36,976] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:36,991] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:37,042] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:37,043] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:37,075] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:37,097] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:37,103] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:37,104] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:37,117] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:37,195] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:04:41,568] [INFO] [partition_parameters.py:348:__exit__] finished initializing model - num_params = 399, num_elems = 8.19B
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-07_batch-block1-3273
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-07_batch-block1-3273
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-07_batch-block1-3273
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-06_batch-block1-3273
Using Prefix Tree collatorUsing Prefix Tree collator
args.report_to: ['wandb']args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-06_batch-block1-3273args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-07_batch-block1-3273
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-07_batch-block1-3273
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-07_batch-block1-3273
Using Prefix Tree collator
Using Prefix Tree collatorUsing Prefix Tree collator
Using Prefix Tree collatorUsing Prefix Tree collator
Using Prefix Tree collatorUsing Prefix Tree collatorUsing Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-3908
args.report_to: ['wandb']args.report_to: ['wandb']args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-3908args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-3908args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-3908
Using Prefix Tree collator
Using Prefix Tree collatorUsing Prefix Tree collatorUsing Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-3908
args.report_to: ['wandb']
args.report_to: ['wandb']args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-3908
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-3908args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-3908
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1027
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1027
args.report_to: ['wandb']args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1027args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1027
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1027
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1027
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1027
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1027
Using Prefix Tree collatorUsing Prefix Tree collatorUsing Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collatorUsing Prefix Tree collatorUsing Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-29_batch-block1-3227args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-29_batch-block1-3227
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-29_batch-block1-3227
args.report_to: ['wandb']args.report_to: ['wandb']args.report_to: ['wandb']
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-29_batch-block1-3227args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-29_batch-block1-3227args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-29_batch-block1-3227
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-29_batch-block1-3227
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-29_batch-block1-3227
Using Prefix Tree collatorUsing Prefix Tree collator
Using Prefix Tree collatorUsing Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1015
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1015
args.report_to: ['wandb']args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1015args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1015
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1015
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1015
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1015
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1015
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1134
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1134
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1134
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1134
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1134
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1134
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1134
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-30_batch-block1-1134
Using Prefix Tree collator
Using Prefix Tree collatorUsing Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collatorUsing Prefix Tree collator
Using Prefix Tree collatorUsing Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-31_batch-block1-3848args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-31_batch-block1-3848
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-31_batch-block1-3848
args.report_to: ['wandb']args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-31_batch-block1-3848
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-31_batch-block1-3848
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-31_batch-block1-3848
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-31_batch-block1-3848args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-31_batch-block1-3848
Using Prefix Tree collator
Using Prefix Tree collatorUsing Prefix Tree collatorUsing Prefix Tree collatorUsing Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collatorUsing Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-29_batch-block1-3297
args.report_to: ['wandb']args.report_to: ['wandb']
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-29_batch-block1-3297args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-29_batch-block1-3297args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-29_batch-block1-3297
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-29_batch-block1-3297
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-29_batch-block1-3297
args.report_to: ['wandb']args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-29_batch-block1-3297args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-04-29_batch-block1-3297
Parameter Offload: Total persistent parameters: 308224 in 145 params
{'train_runtime': 3.8131, 'train_samples_per_second': 2022.522, 'train_steps_per_second': 16.784, 'train_loss': 0.0, 'epoch': 8.0}
wandb:
wandb: 🚀 View run runs/dev/tw-8node-resume at: https://wandb.ai/ligeng-zhu/ThreadWeaver/runs/tw-8node-resume

594
slurm/9168640.0.err Normal file

File diff suppressed because one or more lines are too long

630
slurm/9168640.0.out Normal file
View File

@@ -0,0 +1,630 @@
SLURM_JOB_ID = 9168640
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
SLURM_JOB_ID = 9168640
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
SLURM_JOB_ID = 9168640
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
SLURM_JOB_ID = 9168640
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
SLURM_JOB_ID = 9168640
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
SLURM_JOB_ID = 9168640
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
NODES = batch-block1-1015 batch-block1-3297 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3908 batch-block1-3227
NODE_RANK = 3
GPUS_PER_NODE = 8
NODES = batch-block1-1015 batch-block1-3297 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3908 batch-block1-3227
NODE_RANK = 7
GPUS_PER_NODE = 8
NODES = batch-block1-1015 batch-block1-3297 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3908 batch-block1-3227
NODE_RANK = 4
GPUS_PER_NODE = 8
SLURM_JOB_ID = 9168640
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
NODES = batch-block1-1015 batch-block1-3297 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3908 batch-block1-3227
NODE_RANK = 6
GPUS_PER_NODE = 8
NODES = batch-block1-1015 batch-block1-3297 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3908 batch-block1-3227
NODE_RANK = 5
GPUS_PER_NODE = 8
NODES = batch-block1-1015 batch-block1-3297 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3908 batch-block1-3227
NODE_RANK = 1
GPUS_PER_NODE = 8
MASTER_ADDR = batch-block1-1015
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
MASTER_ADDR = batch-block1-1015
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
MASTER_ADDR = batch-block1-1015
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
NODES = batch-block1-1015 batch-block1-3297 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3908 batch-block1-3227
NODE_RANK = 2
GPUS_PER_NODE = 8
MASTER_ADDR = batch-block1-1015
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
MASTER_ADDR = batch-block1-1015
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
MASTER_ADDR = batch-block1-1015
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
MASTER_ADDR = batch-block1-1015
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
SLURM_JOB_ID = 9168640
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
NODES = batch-block1-1015 batch-block1-3297 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3908 batch-block1-3227
NODE_RANK = 0
GPUS_PER_NODE = 8
MASTER_ADDR = batch-block1-1015
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
[2026-04-13 23:06:06,690] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:06,691] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:06,738] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:06,739] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:06,745] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:06,746] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:06,753] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:06,753] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:06,806] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:06,811] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:06,827] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:06,828] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:06,837] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:06,837] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:06,840] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:06,843] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:06,844] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:06,849] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:06,849] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:06,851] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:06,853] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:06,860] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:06,865] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:06,870] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:06,880] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:06,890] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:06,892] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:06,895] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:06,896] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:06,898] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:06,898] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:06,919] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:06,927] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:06,927] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:06,932] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:06,931] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:06,944] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:06,953] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:06,960] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:06,963] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:06,968] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:06,973] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:06,974] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:06,976] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:06,991] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:07,006] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:07,009] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:07,010] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:07,010] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:07,010] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:07,016] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:07,036] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:07,037] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:07,039] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:07,040] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:07,040] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:07,041] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:07,077] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:07,077] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:07,081] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:07,104] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:07,107] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:07,109] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:07,124] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:06:14,863] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:14,928] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:14,998] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:15,030] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:15,096] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:15,122] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:15,201] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:15,223] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:15,228] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:15,260] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:15,282] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:15,287] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:15,300] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:15,314] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:15,331] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:15,339] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:15,344] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:15,389] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:15,404] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:15,432] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:15,438] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:15,484] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:15,522] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:15,559] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:15,610] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:15,657] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:15,670] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:15,676] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:15,809] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:15,823] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:15,887] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:15,912] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:15,919] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:15,931] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:15,935] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:15,963] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:15,974] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:16,071] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:16,073] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:16,074] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:16,102] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:16,116] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:16,116] [INFO] [comm.py:706:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
[2026-04-13 23:06:16,209] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:16,210] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:16,230] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:16,307] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:16,315] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:16,317] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:16,373] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:16,380] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:16,381] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:16,386] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:16,406] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:16,420] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:16,463] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:16,464] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:16,467] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:16,573] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:16,578] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:16,583] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:16,613] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:16,613] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:16,620] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:16,627] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:16,628] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:16,631] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:16,648] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:16,667] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:16,669] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:16,689] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:16,696] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:16,720] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:16,723] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:16,724] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:16,750] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:16,763] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:16,778] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:16,800] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:16,827] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:16,839] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:16,844] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:16,858] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:16,886] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:06:16,892] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:16,893] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:16,896] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:16,898] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:16,904] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:16,910] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:16,911] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:16,913] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:16,922] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:16,951] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:16,952] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:16,954] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:17,285] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:17,331] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:17,427] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:17,444] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:17,446] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:17,456] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:17,464] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:23,267] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:23,284] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:23,306] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:23,330] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:23,359] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:23,360] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:23,367] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:23,373] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:23,374] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:23,383] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:23,373] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:23,446] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:23,509] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:23,510] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:23,520] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:23,537] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:27,059] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:27,080] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:27,101] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:27,149] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:27,154] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:27,158] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:27,171] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:27,171] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:27,194] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:27,195] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:06:31,119] [INFO] [partition_parameters.py:348:__exit__] finished initializing model - num_params = 399, num_elems = 8.19B
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-3297
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-1134
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-1134
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-3848
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-1134
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-3848
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-3848
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-14_batch-block1-1027
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-1027
args.report_to: ['wandb']
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-1134
args.report_to: ['wandb']
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3227
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-1015
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3273
args.report_to: ['wandb']
args.report_to: ['wandb']
args.report_to: ['wandb']
Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-3848
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3227
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-3297
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3273
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3273
args.report_to: ['wandb']
Using Prefix Tree collator
args.report_to: ['wandb']
Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-3297
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3227
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-1027
Using Prefix Tree collator
args.report_to: ['wandb']
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-3848
Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-1027
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-1015
Using Prefix Tree collator
args.report_to: ['wandb']
Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-1134
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
args.report_to: ['wandb']
Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-3297
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3273
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-3848
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3227
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-3848
args.report_to: ['wandb']
args.report_to: ['wandb']
args.report_to: ['wandb']
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3273
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3227
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-1015
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3908
Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-3297
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collatorargs.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3908
args.report_to: ['wandb']
Using Prefix Tree collator
args.report_to: ['wandb']
Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-1134
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3908
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-1015
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3908
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3908
Using Prefix Tree collator
args.report_to: ['wandb']
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3273
args.report_to: ['wandb']
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-1134
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-1015
args.report_to: ['wandb']
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-1015
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3227
args.report_to: ['wandb']
Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3273
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-1027
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-3297
args.report_to: ['wandb']
args.report_to: ['wandb']
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-3848
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-1027
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-1015
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3908
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-1027
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-1015
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-3297
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-14_batch-block1-1027
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3908
Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3273
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3227
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-3227
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-15_batch-block1-3297
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-16_batch-block1-3908
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-06-14_batch-block1-1134
Parameter Offload: Total persistent parameters: 308224 in 145 params
{'train_runtime': 3.1226, 'train_samples_per_second': 2469.703, 'train_steps_per_second': 20.495, 'train_loss': 0.0, 'epoch': 8.0}
wandb:
wandb: 🚀 View run runs/dev/tw-8node-resume at: https://wandb.ai/ligeng-zhu/ThreadWeaver/runs/tw-8node-resume

594
slurm/9168643.0.err Normal file

File diff suppressed because one or more lines are too long

630
slurm/9168643.0.out Normal file
View File

@@ -0,0 +1,630 @@
SLURM_JOB_ID = 9168643
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
NODES = batch-block1-1015 batch-block1-3297 batch-block1-3887 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3227
NODE_RANK = 2
GPUS_PER_NODE = 8
MASTER_ADDR = batch-block1-1015
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
SLURM_JOB_ID = 9168643
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
SLURM_JOB_ID = 9168643
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
SLURM_JOB_ID = 9168643
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
SLURM_JOB_ID = 9168643
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
SLURM_JOB_ID = 9168643
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
SLURM_JOB_ID = 9168643
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
NODES = batch-block1-1015 batch-block1-3297 batch-block1-3887 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3227
NODE_RANK = 6
GPUS_PER_NODE = 8
SLURM_JOB_ID = 9168643
SLURM_JOB_NAME = nvr_elm_llm:dev/tw-8node-resume
RUN_NAME = tw-8node-resume
OUTPUT_DIR = runs/dev/tw-8node-resume
NNODES = 8
NODES = batch-block1-1015 batch-block1-3297 batch-block1-3887 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3227
NODE_RANK = 4
GPUS_PER_NODE = 8
NODES = batch-block1-1015 batch-block1-3297 batch-block1-3887 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3227
NODE_RANK = 5
GPUS_PER_NODE = 8
NODES = batch-block1-1015 batch-block1-3297 batch-block1-3887 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3227
NODE_RANK = 1
GPUS_PER_NODE = 8
NODES = batch-block1-1015 batch-block1-3297 batch-block1-3887 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3227
NODE_RANK = 7
GPUS_PER_NODE = 8
NODES = batch-block1-1015 batch-block1-3297 batch-block1-3887 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3227
NODE_RANK = 3
GPUS_PER_NODE = 8
MASTER_ADDR = batch-block1-1015
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
NODES = batch-block1-1015 batch-block1-3297 batch-block1-3887 batch-block1-1134 batch-block1-3848 batch-block1-3273 batch-block1-1027 batch-block1-3227
NODE_RANK = 0
GPUS_PER_NODE = 8
MASTER_ADDR = batch-block1-1015
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
MASTER_ADDR = batch-block1-1015
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
MASTER_ADDR = batch-block1-1015
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
MASTER_ADDR = batch-block1-1015
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
MASTER_ADDR = batch-block1-1015
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
MASTER_ADDR = batch-block1-1015
MASTER_PORT = 25001
GLOBAL_TRAIN_BATCH_SIZE =
GRADIENT_ACCUMULATION_STEPS =
PER_DEVICE_TRAIN_BATCH_SIZE =
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
Resuming training from checkpoint: runs/dev/tw-8node-resume/checkpoint-64
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1Imported prefix tree collator v1
Imported prefix tree collator v1Imported prefix tree collator v1
Imported prefix tree collator v1Imported prefix tree collator v1
Imported prefix tree collator v1
Imported prefix tree collator v1
[2026-04-13 23:07:56,557] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:07:56,558] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:07:56,616] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:07:56,617] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:07:56,643] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:07:56,643] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:07:56,731] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:07:56,747] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:07:56,760] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:07:56,762] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:07:56,785] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:07:56,787] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:07:56,787] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:07:56,792] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:07:56,800] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:07:56,806] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:07:56,920] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:07:56,927] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:07:56,955] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:07:57,003] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:07:57,028] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:07:57,036] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:07:57,071] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:07:57,082] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:07:57,082] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:07:57,083] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:07:57,083] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:07:57,083] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:07:57,084] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:07:57,085] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:07:57,085] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:07:57,086] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:07:57,087] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:07:57,088] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:07:57,092] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:07:57,093] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:07:57,098] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:07:57,120] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:07:57,122] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:07:57,125] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:07:57,140] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:07:57,142] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:07:57,147] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:07:57,163] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:07:57,167] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:07:57,170] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:07:57,172] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:07:57,175] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:07:57,175] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:07:57,179] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:07:57,185] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:07:57,204] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:07:57,217] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:07:57,219] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:07:57,230] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:07:57,237] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:08:05,010] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:05,025] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:05,053] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:05,070] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:05,071] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:05,113] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:05,253] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:05,279] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:05,373] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:05,378] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:05,432] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:05,464] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:05,500] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:05,548] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:05,609] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:05,613] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:05,629] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:05,656] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:05,660] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:05,691] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:05,693] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:05,693] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:05,694] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:05,740] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:05,782] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:05,795] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:05,806] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:05,823] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:05,831] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:05,841] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:05,845] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:05,847] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:05,850] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:05,859] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:05,859] [INFO] [comm.py:706:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
[2026-04-13 23:08:05,881] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:05,893] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:05,948] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:05,965] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:05,970] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:05,989] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:06,001] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:06,007] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:06,011] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:06,013] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:06,039] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:06,078] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:06,118] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:06,128] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:06,129] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:06,131] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:06,132] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:06,145] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:06,193] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:06,197] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:06,240] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:06,257] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:06,371] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:06,391] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:06,431] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:06,444] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:06,492] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:06,500] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:06,506] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:06,510] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:06,514] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:06,518] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:06,518] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:06,518] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:06,520] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:06,523] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:06,528] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:06,529] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:06,540] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:06,546] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:06,550] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:06,551] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:06,555] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:06,579] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:06,609] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:06,630] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:06,645] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:06,697] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:06,714] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:06,714] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:06,732] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:06,735] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:06,740] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:06,740] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:06,755] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:13,160] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:13,184] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:13,184] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:13,185] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:13,224] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:13,239] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:13,240] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:13,243] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:13,243] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:13,245] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:13,254] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:13,264] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:13,272] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:13,291] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:13,452] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:13,457] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:17,477] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:17,531] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:17,547] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:17,548] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:17,558] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:17,645] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:17,648] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:18,097] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:08:18,097] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:08:18,097] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:08:18,097] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:08:18,097] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:08:18,097] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:08:18,097] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:08:18,098] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2026-04-13 23:08:30,097] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:30,098] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:30,098] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:30,099] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:30,109] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:30,110] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:30,110] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:30,110] [INFO] [comm.py:675:init_distributed] cdb=None
[2026-04-13 23:08:32,677] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:32,677] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:32,677] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:32,677] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:32,677] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:32,677] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:32,677] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:32,677] [INFO] [config.py:744:__init__] Config mesh_device None world_size = 64
[2026-04-13 23:08:36,881] [INFO] [partition_parameters.py:348:__exit__] finished initializing model - num_params = 399, num_elems = 8.19B
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Qwen templates for runs/dev/tw-8node-resume/checkpoint-64
Using Prefix Tree collator
args.report_to: ['wandb']
Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3273
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1134
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-3297
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3848
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
args.report_to: ['wandb']
Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3848
Using Prefix Tree collator
args.report_to: ['wandb']Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1134
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-1134
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1015
args.report_to: ['wandb']
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1027
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3273
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-1015
Using Prefix Tree collator
args.report_to: ['wandb']
Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-3297
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-1015
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3848
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3848
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3273
Using Prefix Tree collator
args.report_to: ['wandb']
args.report_to: ['wandb']
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-3297
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1134
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3273
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-1015
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3273
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
args.report_to: ['wandb']
Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3273
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3273
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-3297
args.report_to: ['wandb']
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-1134
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3848
args.report_to: ['wandb']
Using Prefix Tree collator
args.report_to: ['wandb']
Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-1015
Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3848
args.report_to: ['wandb']
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1134
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3227
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3297
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
args.report_to: ['wandb']
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1027
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3297
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3848
args.report_to: ['wandb']
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3227
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-1015
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1027
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3273
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3227
Using Prefix Tree collator
args.report_to: ['wandb']
Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3227
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
Using Prefix Tree collator
Using Prefix Tree collator
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-3297
Using Prefix Tree collator
args.report_to: ['wandb']
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1027
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1027
args.report_to: ['wandb']
args.report_to: ['wandb']
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1027
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3227
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3227
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-3297
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3848
Using Prefix Tree collator
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3227
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1027
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1015
Using Prefix Tree collator
Using Prefix Tree collator
args.report_to: ['wandb']
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1027
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-1015
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-1134
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-06_batch-block1-1134
Using Prefix Tree collator
Using Prefix Tree collatorUsing Prefix Tree collator
Using Prefix Tree collatorUsing Prefix Tree collator
Using Prefix Tree collatorUsing Prefix Tree collatorUsing Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-30_batch-block1-3887
args.report_to: ['wandb']
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-30_batch-block1-3887
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-30_batch-block1-3887
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-30_batch-block1-3887
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-30_batch-block1-3887
args.report_to: ['wandb']
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-30_batch-block1-3887
args.report_to: ['wandb']args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-30_batch-block1-3887
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-30_batch-block1-3887
Using Prefix Tree collator
args.report_to: ['wandb']
args.logging_dir: runs/dev/tw-8node-resume/runs/Apr13_23-08-05_batch-block1-3227
Parameter Offload: Total persistent parameters: 308224 in 145 params
{'train_runtime': 3.1214, 'train_samples_per_second': 2470.663, 'train_steps_per_second': 20.503, 'train_loss': 0.0, 'epoch': 8.0}
wandb:
wandb: 🚀 View run runs/dev/tw-8node-resume at: https://wandb.ai/ligeng-zhu/ThreadWeaver/runs/tw-8node-resume

124
special_tokens_map.json Normal file
View File

@@ -0,0 +1,124 @@
{
"additional_special_tokens": [
{
"content": "<Think>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
{
"content": "</Think>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
{
"content": "<Parallel>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
{
"content": "</Parallel>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
{
"content": "<Outlines>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
{
"content": "</Outlines>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
{
"content": "<Outline>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
{
"content": "</Outline>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
{
"content": "<Trial>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
{
"content": "</Trial>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
{
"content": "<Subtask>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
{
"content": "</Subtask>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
{
"content": "<Thread>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
{
"content": "</Thread>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
{
"content": "<Conclusion>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
{
"content": "</Conclusion>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
}
],
"eos_token": {
"content": "<|im_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"pad_token": "<|fim_pad|>"
}

3
tokenizer.json Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0233e510f0f3cba54c9223bf44453616eb93c1219f688a05efb426e217cee80a
size 11425637

371
tokenizer_config.json Normal file
View File

@@ -0,0 +1,371 @@
{
"add_bos_token": false,
"add_prefix_space": false,
"added_tokens_decoder": {
"151643": {
"content": "<|endoftext|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151644": {
"content": "<|im_start|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151645": {
"content": "<|im_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151646": {
"content": "<|object_ref_start|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151647": {
"content": "<|object_ref_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151648": {
"content": "<|box_start|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151649": {
"content": "<|box_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151650": {
"content": "<|quad_start|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151651": {
"content": "<|quad_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151652": {
"content": "<|vision_start|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151653": {
"content": "<|vision_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151654": {
"content": "<|vision_pad|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151655": {
"content": "<|image_pad|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151656": {
"content": "<|video_pad|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151657": {
"content": "<tool_call>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151658": {
"content": "</tool_call>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151659": {
"content": "<|fim_prefix|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151660": {
"content": "<|fim_middle|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151661": {
"content": "<|fim_suffix|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151662": {
"content": "<|fim_pad|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151663": {
"content": "<|repo_name|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151664": {
"content": "<|file_sep|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151665": {
"content": "<tool_response>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151666": {
"content": "</tool_response>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151667": {
"content": "<think>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151668": {
"content": "</think>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151669": {
"content": "<Think>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151670": {
"content": "</Think>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151671": {
"content": "<Parallel>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151672": {
"content": "</Parallel>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151673": {
"content": "<Outlines>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151674": {
"content": "</Outlines>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151675": {
"content": "<Outline>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151676": {
"content": "</Outline>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151677": {
"content": "<Trial>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151678": {
"content": "</Trial>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151679": {
"content": "<Subtask>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151680": {
"content": "</Subtask>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151681": {
"content": "<Thread>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151682": {
"content": "</Thread>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151683": {
"content": "<Conclusion>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151684": {
"content": "</Conclusion>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
}
},
"additional_special_tokens": [
"<Think>",
"</Think>",
"<Parallel>",
"</Parallel>",
"<Outlines>",
"</Outlines>",
"<Outline>",
"</Outline>",
"<Trial>",
"</Trial>",
"<Subtask>",
"</Subtask>",
"<Thread>",
"</Thread>",
"<Conclusion>",
"</Conclusion>"
],
"bos_token": null,
"chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {{- messages[0].content + '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n {%- set index = (messages|length - 1) - loop.index0 %}\n {%- if ns.multi_step_tool and message.role == \"user\" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}\n {%- set ns.multi_step_tool = false %}\n {%- set ns.last_query_index = index %}\n {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n {%- if message.content is string %}\n {%- set content = message.content %}\n {%- else %}\n {%- set content = '' %}\n {%- endif %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n {{- '<|im_start|>' + message.role + '\\n' + content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {%- set reasoning_content = '' %}\n {%- if message.reasoning_content is string %}\n {%- set reasoning_content = message.reasoning_content %}\n {%- else %}\n {%- if '</think>' in content %}\n {%- set reasoning_content = content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}\n {%- set content = content.split('</think>')[-1].lstrip('\\n') %}\n {%- endif %}\n {%- endif %}\n {%- if loop.index0 > ns.last_query_index %}\n {%- if loop.last or (not loop.last and reasoning_content) %}\n {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content.strip('\\n') + '\\n</think>\\n\\n' + content.lstrip('\\n') }}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n {%- if enable_thinking is defined and enable_thinking is false %}\n {{- '<think>\\n\\n</think>\\n\\n' }}\n {%- endif %}\n{%- endif %}",
"clean_up_tokenization_spaces": false,
"eos_token": "<|im_end|>",
"errors": "replace",
"extra_special_tokens": {},
"model_max_length": 131072,
"pad_token": "<|fim_pad|>",
"split_special_tokens": false,
"tokenizer_class": "Qwen2Tokenizer",
"unk_token": null
}

3
training_args.bin Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9ac6b9f2dd0653e4ac399a58c1b8c8a35aabd786bd251f673080a5b3f944d985
size 7672

1
vocab.json Normal file

File diff suppressed because one or more lines are too long