2466 lines
416 KiB
Plaintext
2466 lines
416 KiB
Plaintext
|
|
create a temporary directory: /tmp/tmp3n6qz8ik
|
|||
|
|
loading /media/hangyu5/Home/Documents/Hugging-Face/LM_cocktail/meow -----------------
|
|||
|
|
Loading checkpoint shards: 0%| | 0/5 [00:00<?, ?it/s]
Loading checkpoint shards: 20%|██ | 1/5 [00:02<00:08, 2.11s/it]
Loading checkpoint shards: 40%|████ | 2/5 [00:04<00:06, 2.17s/it]
Loading checkpoint shards: 60%|██████ | 3/5 [00:06<00:04, 2.17s/it]
Loading checkpoint shards: 80%|████████ | 4/5 [00:08<00:02, 2.03s/it]
Loading checkpoint shards: 100%|██████████| 5/5 [00:08<00:00, 1.51s/it]
Loading checkpoint shards: 100%|██████████| 5/5 [00:08<00:00, 1.78s/it]
|
|||
|
|
loading /media/hangyu5/Home/Documents/Hugging-Face/LM_cocktail/SOLAR-10.7B-Instruct-v1.0 -----------------
|
|||
|
|
Loading checkpoint shards: 0%| | 0/5 [00:00<?, ?it/s]
Loading checkpoint shards: 20%|██ | 1/5 [00:00<00:03, 1.22it/s]
Loading checkpoint shards: 40%|████ | 2/5 [00:01<00:02, 1.24it/s]
Loading checkpoint shards: 60%|██████ | 3/5 [00:02<00:01, 1.27it/s]
Loading checkpoint shards: 80%|████████ | 4/5 [00:03<00:00, 1.28it/s]
Loading checkpoint shards: 100%|██████████| 5/5 [00:04<00:00, 1.18it/s]
Loading checkpoint shards: 100%|██████████| 5/5 [00:04<00:00, 1.21it/s]
|
|||
|
|
Merging models: 0%| | 0/435 [00:00<?, ?it/s]
|
|||
|
|
Processing model.layers.34.mlp.down_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.34.mlp.down_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 5.94it/s][A
|
|||
|
|
Processing model.layers.34.mlp.down_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 6.40it/s][A
|
|||
|
|
[A
Merging models: 0%| | 1/435 [00:00<02:17, 3.16it/s]
|
|||
|
|
Processing model.layers.41.mlp.down_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.41.mlp.down_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 7.49it/s][A
|
|||
|
|
Processing model.layers.41.mlp.down_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.04it/s][A
|
|||
|
|
[A
Merging models: 0%| | 2/435 [00:00<02:08, 3.38it/s]
|
|||
|
|
Processing model.layers.26.mlp.down_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.26.mlp.down_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 7.43it/s][A
|
|||
|
|
Processing model.layers.26.mlp.down_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.11it/s][A
|
|||
|
|
[A
Merging models: 1%| | 3/435 [00:00<02:04, 3.46it/s]
|
|||
|
|
Processing model.layers.5.mlp.gate_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.5.mlp.gate_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 4.67it/s][A
|
|||
|
|
Processing model.layers.5.mlp.gate_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 5.47it/s][A
|
|||
|
|
[A
Merging models: 1%| | 4/435 [00:01<02:19, 3.09it/s]
|
|||
|
|
Processing model.layers.16.self_attn.k_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.16.self_attn.k_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 10.67it/s][A
|
|||
|
|
[A
Merging models: 1%| | 5/435 [00:01<01:57, 3.64it/s]
|
|||
|
|
Processing model.layers.37.mlp.down_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.37.mlp.down_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 7.49it/s][A
|
|||
|
|
Processing model.layers.37.mlp.down_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 5.12it/s][A
|
|||
|
|
[A
Merging models: 1%|▏ | 6/435 [00:01<02:12, 3.25it/s]
|
|||
|
|
Processing model.layers.25.mlp.gate_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.25.mlp.gate_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 6.97it/s][A
|
|||
|
|
Processing model.layers.25.mlp.gate_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.98it/s][A
|
|||
|
|
[A
Merging models: 2%|▏ | 7/435 [00:02<02:22, 3.00it/s]
|
|||
|
|
Processing model.layers.18.self_attn.v_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.18.self_attn.v_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 10.58it/s][A
|
|||
|
|
[A
Merging models: 2%|▏ | 8/435 [00:02<02:02, 3.48it/s]
|
|||
|
|
Processing model.layers.12.self_attn.q_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.12.self_attn.q_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 6.91it/s][A
|
|||
|
|
Processing model.layers.12.self_attn.q_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.81it/s][A
|
|||
|
|
[A
Merging models: 2%|▏ | 9/435 [00:02<01:58, 3.58it/s]
|
|||
|
|
Processing model.layers.14.mlp.down_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.14.mlp.down_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 4.23it/s][A
|
|||
|
|
Processing model.layers.14.mlp.down_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.21it/s][A
|
|||
|
|
[A
Merging models: 2%|▏ | 10/435 [00:03<02:24, 2.94it/s]
|
|||
|
|
Processing model.layers.11.mlp.up_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.11.mlp.up_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 3.56it/s][A
|
|||
|
|
Processing model.layers.11.mlp.up_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.11it/s][A
|
|||
|
|
[A
Merging models: 3%|▎ | 11/435 [00:03<02:44, 2.58it/s]
|
|||
|
|
Processing model.layers.26.mlp.gate_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.26.mlp.gate_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 6.95it/s][A
|
|||
|
|
Processing model.layers.26.mlp.gate_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 6.34it/s][A
|
|||
|
|
[A
Merging models: 3%|▎ | 12/435 [00:03<02:34, 2.74it/s]
|
|||
|
|
Processing model.layers.5.post_attention_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.5.post_attention_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 11.03it/s][A
|
|||
|
|
[A
Merging models: 3%|▎ | 13/435 [00:04<02:10, 3.23it/s]
|
|||
|
|
Processing model.layers.32.mlp.down_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.32.mlp.down_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 7.10it/s][A
|
|||
|
|
Processing model.layers.32.mlp.down_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 5.06it/s][A
|
|||
|
|
[A
Merging models: 3%|▎ | 14/435 [00:04<02:19, 3.03it/s]
|
|||
|
|
Processing model.layers.27.input_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.27.input_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 11.15it/s][A
|
|||
|
|
[A
Merging models: 3%|▎ | 15/435 [00:04<01:59, 3.51it/s]
|
|||
|
|
Processing model.layers.3.mlp.up_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.3.mlp.up_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 4.39it/s][A
|
|||
|
|
Processing model.layers.3.mlp.up_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.29it/s][A
|
|||
|
|
[A
Merging models: 4%|▎ | 16/435 [00:05<02:22, 2.95it/s]
|
|||
|
|
Processing model.layers.44.post_attention_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.44.post_attention_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 10.99it/s][A
|
|||
|
|
[A
Merging models: 4%|▍ | 17/435 [00:05<02:02, 3.43it/s]
|
|||
|
|
Processing model.layers.30.post_attention_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.30.post_attention_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 10.93it/s][A
|
|||
|
|
[A
Merging models: 4%|▍ | 18/435 [00:05<01:48, 3.86it/s]
|
|||
|
|
Processing model.layers.34.self_attn.k_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.34.self_attn.k_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 10.54it/s][A
|
|||
|
|
[A
Merging models: 4%|▍ | 19/435 [00:05<01:39, 4.19it/s]
|
|||
|
|
Processing model.layers.6.mlp.down_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.6.mlp.down_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 3.55it/s][A
|
|||
|
|
Processing model.layers.6.mlp.down_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 3.89it/s][A
|
|||
|
|
[A
Merging models: 5%|▍ | 20/435 [00:06<02:14, 3.09it/s]
|
|||
|
|
Processing model.layers.13.post_attention_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.13.post_attention_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 10.99it/s][A
|
|||
|
|
[A
Merging models: 5%|▍ | 21/435 [00:06<01:56, 3.56it/s]
|
|||
|
|
Processing model.layers.28.self_attn.q_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.28.self_attn.q_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 9.24it/s][A
|
|||
|
|
Processing model.layers.28.self_attn.q_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 9.04it/s][A
|
|||
|
|
[A
Merging models: 5%|▌ | 22/435 [00:06<01:48, 3.80it/s]
|
|||
|
|
Processing model.layers.19.self_attn.k_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.19.self_attn.k_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 10.65it/s][A
|
|||
|
|
[A
Merging models: 5%|▌ | 23/435 [00:06<01:39, 4.16it/s]
|
|||
|
|
Processing model.layers.25.input_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.25.input_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 11.18it/s][A
|
|||
|
|
[A
Merging models: 6%|▌ | 24/435 [00:06<01:31, 4.50it/s]
|
|||
|
|
Processing model.layers.45.post_attention_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.45.post_attention_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 9.20it/s][A
|
|||
|
|
[A
Merging models: 6%|▌ | 25/435 [00:07<01:30, 4.53it/s]
|
|||
|
|
Processing model.layers.35.post_attention_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.35.post_attention_layernorm.weight: 50%|█████ | 1/2 [00:00<00:00, 8.21it/s][A
|
|||
|
|
Processing model.layers.35.post_attention_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 8.04it/s][A
|
|||
|
|
[A
Merging models: 6%|▌ | 26/435 [00:07<01:33, 4.36it/s]
|
|||
|
|
Processing model.layers.45.self_attn.v_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.45.self_attn.v_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 8.33it/s][A
|
|||
|
|
Processing model.layers.45.self_attn.v_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 8.60it/s][A
|
|||
|
|
[A
Merging models: 6%|▌ | 27/435 [00:07<01:34, 4.33it/s]
|
|||
|
|
Processing model.layers.41.self_attn.k_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.41.self_attn.k_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 9.22it/s][A
|
|||
|
|
[A
Merging models: 6%|▋ | 28/435 [00:07<01:31, 4.47it/s]
|
|||
|
|
Processing model.layers.20.mlp.up_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.20.mlp.up_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 6.98it/s][A
|
|||
|
|
Processing model.layers.20.mlp.up_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 6.42it/s][A
|
|||
|
|
[A
Merging models: 7%|▋ | 29/435 [00:08<01:41, 4.01it/s]
|
|||
|
|
Processing model.layers.32.input_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.32.input_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 11.14it/s][A
|
|||
|
|
[A
Merging models: 7%|▋ | 30/435 [00:08<01:32, 4.38it/s]
|
|||
|
|
Processing model.layers.3.self_attn.q_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.3.self_attn.q_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 7.33it/s][A
|
|||
|
|
Processing model.layers.3.self_attn.q_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 8.03it/s][A
|
|||
|
|
[A
Merging models: 7%|▋ | 31/435 [00:08<01:35, 4.24it/s]
|
|||
|
|
Processing model.layers.28.mlp.up_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.28.mlp.up_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 7.01it/s][A
|
|||
|
|
Processing model.layers.28.mlp.up_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 5.06it/s][A
|
|||
|
|
[A
Merging models: 7%|▋ | 32/435 [00:09<01:52, 3.59it/s]
|
|||
|
|
Processing model.layers.10.self_attn.q_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.10.self_attn.q_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 7.37it/s][A
|
|||
|
|
Processing model.layers.10.self_attn.q_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 8.05it/s][A
|
|||
|
|
[A
Merging models: 8%|▊ | 33/435 [00:09<01:48, 3.69it/s]
|
|||
|
|
Processing model.layers.20.mlp.gate_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.20.mlp.gate_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 4.69it/s][A
|
|||
|
|
Processing model.layers.20.mlp.gate_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.38it/s][A
|
|||
|
|
[A
Merging models: 8%|▊ | 34/435 [00:09<02:10, 3.07it/s]
|
|||
|
|
Processing model.layers.46.self_attn.k_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.46.self_attn.k_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 10.57it/s][A
|
|||
|
|
[A
Merging models: 8%|▊ | 35/435 [00:09<01:53, 3.51it/s]
|
|||
|
|
Processing model.layers.23.input_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.23.input_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 11.10it/s][A
|
|||
|
|
[A
Merging models: 8%|▊ | 36/435 [00:10<01:41, 3.95it/s]
|
|||
|
|
Processing model.layers.8.self_attn.k_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.8.self_attn.k_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 9.72it/s][A
|
|||
|
|
[A
Merging models: 9%|▊ | 37/435 [00:10<01:34, 4.22it/s]
|
|||
|
|
Processing model.layers.3.self_attn.v_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.3.self_attn.v_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 9.72it/s][A
|
|||
|
|
[A
Merging models: 9%|▊ | 38/435 [00:10<01:29, 4.44it/s]
|
|||
|
|
Processing model.layers.7.self_attn.v_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.7.self_attn.v_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 9.71it/s][A
|
|||
|
|
[A
Merging models: 9%|▉ | 39/435 [00:10<01:26, 4.60it/s]
|
|||
|
|
Processing model.layers.43.self_attn.q_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.43.self_attn.q_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 8.91it/s][A
|
|||
|
|
Processing model.layers.43.self_attn.q_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 8.87it/s][A
|
|||
|
|
[A
Merging models: 9%|▉ | 40/435 [00:10<01:26, 4.55it/s]
|
|||
|
|
Processing model.layers.25.self_attn.o_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.25.self_attn.o_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 9.38it/s][A
|
|||
|
|
Processing model.layers.25.self_attn.o_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 9.18it/s][A
|
|||
|
|
[A
Merging models: 9%|▉ | 41/435 [00:11<01:26, 4.56it/s]
|
|||
|
|
Processing model.layers.26.self_attn.o_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.26.self_attn.o_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 9.01it/s][A
|
|||
|
|
Processing model.layers.26.self_attn.o_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 8.86it/s][A
|
|||
|
|
[A
Merging models: 10%|▉ | 42/435 [00:11<01:26, 4.52it/s]
|
|||
|
|
Processing model.layers.17.self_attn.k_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.17.self_attn.k_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 10.58it/s][A
|
|||
|
|
[A
Merging models: 10%|▉ | 43/435 [00:11<01:22, 4.73it/s]
|
|||
|
|
Processing model.layers.44.mlp.down_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.44.mlp.down_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 6.98it/s][A
|
|||
|
|
Processing model.layers.44.mlp.down_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 5.14it/s][A
|
|||
|
|
[A
Merging models: 10%|█ | 44/435 [00:11<01:41, 3.84it/s]
|
|||
|
|
Processing model.layers.46.mlp.down_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.46.mlp.down_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 4.74it/s][A
|
|||
|
|
Processing model.layers.46.mlp.down_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.46it/s][A
|
|||
|
|
[A
Merging models: 10%|█ | 45/435 [00:12<02:03, 3.17it/s]
|
|||
|
|
Processing model.layers.41.self_attn.q_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.41.self_attn.q_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 9.10it/s][A
|
|||
|
|
Processing model.layers.41.self_attn.q_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 9.02it/s][A
|
|||
|
|
[A
Merging models: 11%|█ | 46/435 [00:12<01:51, 3.48it/s]
|
|||
|
|
Processing model.layers.46.input_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.46.input_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 11.03it/s][A
|
|||
|
|
[A
Merging models: 11%|█ | 47/435 [00:12<01:39, 3.91it/s]
|
|||
|
|
Processing model.layers.34.input_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.34.input_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 11.03it/s][A
|
|||
|
|
[A
Merging models: 11%|█ | 48/435 [00:12<01:30, 4.28it/s]
|
|||
|
|
Processing model.layers.13.self_attn.q_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.13.self_attn.q_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 7.41it/s][A
|
|||
|
|
Processing model.layers.13.self_attn.q_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.98it/s][A
|
|||
|
|
[A
Merging models: 11%|█▏ | 49/435 [00:13<01:32, 4.17it/s]
|
|||
|
|
Processing model.layers.28.mlp.down_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.28.mlp.down_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 4.65it/s][A
|
|||
|
|
Processing model.layers.28.mlp.down_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.30it/s][A
|
|||
|
|
[A
Merging models: 11%|█▏ | 50/435 [00:13<01:57, 3.27it/s]
|
|||
|
|
Processing model.layers.34.post_attention_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.34.post_attention_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 11.05it/s][A
|
|||
|
|
[A
Merging models: 12%|█▏ | 51/435 [00:13<01:43, 3.72it/s]
|
|||
|
|
Processing model.layers.28.self_attn.v_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.28.self_attn.v_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 10.61it/s][A
|
|||
|
|
[A
Merging models: 12%|█▏ | 52/435 [00:14<01:33, 4.09it/s]
|
|||
|
|
Processing model.layers.6.post_attention_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.6.post_attention_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 10.95it/s][A
|
|||
|
|
[A
Merging models: 12%|█▏ | 53/435 [00:14<01:26, 4.42it/s]
|
|||
|
|
Processing model.layers.4.self_attn.v_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.4.self_attn.v_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 9.77it/s][A
|
|||
|
|
Processing model.layers.4.self_attn.v_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 9.73it/s][A
|
|||
|
|
[A
Merging models: 12%|█▏ | 54/435 [00:14<01:23, 4.54it/s]
|
|||
|
|
Processing model.layers.33.input_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.33.input_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 10.92it/s][A
|
|||
|
|
[A
Merging models: 13%|█▎ | 55/435 [00:14<01:19, 4.78it/s]
|
|||
|
|
Processing model.layers.46.mlp.gate_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.46.mlp.gate_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 4.65it/s][A
|
|||
|
|
Processing model.layers.46.mlp.gate_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.47it/s][A
|
|||
|
|
[A
Merging models: 13%|█▎ | 56/435 [00:15<01:46, 3.57it/s]
|
|||
|
|
Processing model.layers.4.self_attn.o_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.4.self_attn.o_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 7.41it/s][A
|
|||
|
|
Processing model.layers.4.self_attn.o_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 8.33it/s][A
|
|||
|
|
[A
Merging models: 13%|█▎ | 57/435 [00:15<01:41, 3.71it/s]
|
|||
|
|
Processing model.layers.15.mlp.up_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.15.mlp.up_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 4.88it/s][A
|
|||
|
|
Processing model.layers.15.mlp.up_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.56it/s][A
|
|||
|
|
[A
Merging models: 13%|█▎ | 58/435 [00:15<02:00, 3.13it/s]
|
|||
|
|
Processing model.layers.19.self_attn.o_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.19.self_attn.o_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 8.19it/s][A
|
|||
|
|
Processing model.layers.19.self_attn.o_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.52it/s][A
|
|||
|
|
[A
Merging models: 14%|█▎ | 59/435 [00:15<01:53, 3.31it/s]
|
|||
|
|
Processing model.layers.18.mlp.up_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.18.mlp.up_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 4.80it/s][A
|
|||
|
|
Processing model.layers.18.mlp.up_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.52it/s][A
|
|||
|
|
[A
Merging models: 14%|█▍ | 60/435 [00:16<02:08, 2.91it/s]
|
|||
|
|
Processing model.layers.11.mlp.down_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.11.mlp.down_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 3.63it/s][A
|
|||
|
|
Processing model.layers.11.mlp.down_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.03it/s][A
|
|||
|
|
[A
Merging models: 14%|█▍ | 61/435 [00:16<02:26, 2.55it/s]
|
|||
|
|
Processing model.layers.14.self_attn.o_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.14.self_attn.o_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 6.71it/s][A
|
|||
|
|
Processing model.layers.14.self_attn.o_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 6.93it/s][A
|
|||
|
|
[A
Merging models: 14%|█▍ | 62/435 [00:17<02:14, 2.77it/s]
|
|||
|
|
Processing model.layers.6.self_attn.k_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.6.self_attn.k_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 9.17it/s][A
|
|||
|
|
[A
Merging models: 14%|█▍ | 63/435 [00:17<01:57, 3.18it/s]
|
|||
|
|
Processing model.layers.41.post_attention_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.41.post_attention_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 11.12it/s][A
|
|||
|
|
[A
Merging models: 15%|█▍ | 64/435 [00:17<01:41, 3.64it/s]
|
|||
|
|
Processing model.layers.2.post_attention_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.2.post_attention_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 10.93it/s][A
|
|||
|
|
[A
Merging models: 15%|█▍ | 65/435 [00:17<01:31, 4.05it/s]
|
|||
|
|
Processing model.layers.44.self_attn.v_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.44.self_attn.v_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 9.87it/s][A
|
|||
|
|
[A
Merging models: 15%|█▌ | 66/435 [00:17<01:25, 4.30it/s]
|
|||
|
|
Processing model.layers.18.self_attn.k_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.18.self_attn.k_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 9.85it/s][A
|
|||
|
|
[A
Merging models: 15%|█▌ | 67/435 [00:18<01:21, 4.49it/s]
|
|||
|
|
Processing model.layers.12.self_attn.v_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.12.self_attn.v_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 9.73it/s][A
|
|||
|
|
Processing model.layers.12.self_attn.v_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 9.71it/s][A
|
|||
|
|
[A
Merging models: 16%|█▌ | 68/435 [00:18<01:19, 4.59it/s]
|
|||
|
|
Processing model.layers.41.mlp.up_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.41.mlp.up_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 4.92it/s][A
|
|||
|
|
Processing model.layers.41.mlp.up_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.60it/s][A
|
|||
|
|
[A
Merging models: 16%|█▌ | 69/435 [00:18<01:43, 3.55it/s]
|
|||
|
|
Processing model.layers.0.self_attn.k_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.0.self_attn.k_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 9.72it/s][A
|
|||
|
|
Processing model.layers.0.self_attn.k_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 9.77it/s][A
|
|||
|
|
[A
Merging models: 16%|█▌ | 70/435 [00:19<01:34, 3.86it/s]
|
|||
|
|
Processing model.layers.0.mlp.gate_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.0.mlp.gate_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 3.69it/s][A
|
|||
|
|
Processing model.layers.0.mlp.gate_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.06it/s][A
|
|||
|
|
[A
Merging models: 16%|█▋ | 71/435 [00:19<02:00, 3.02it/s]
|
|||
|
|
Processing model.layers.10.mlp.up_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.10.mlp.up_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 3.65it/s][A
|
|||
|
|
Processing model.layers.10.mlp.up_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.01it/s][A
|
|||
|
|
[A
Merging models: 17%|█▋ | 72/435 [00:20<02:19, 2.60it/s]
|
|||
|
|
Processing model.layers.15.self_attn.k_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.15.self_attn.k_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 10.56it/s][A
|
|||
|
|
[A
Merging models: 17%|█▋ | 73/435 [00:20<01:57, 3.07it/s]
|
|||
|
|
Processing model.layers.24.self_attn.v_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.24.self_attn.v_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 9.77it/s][A
|
|||
|
|
[A
Merging models: 17%|█▋ | 74/435 [00:20<01:43, 3.48it/s]
|
|||
|
|
Processing model.layers.24.self_attn.q_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.24.self_attn.q_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 8.82it/s][A
|
|||
|
|
Processing model.layers.24.self_attn.q_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.83it/s][A
|
|||
|
|
[A
Merging models: 17%|█▋ | 75/435 [00:20<01:39, 3.61it/s]
|
|||
|
|
Processing model.layers.1.input_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.1.input_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 10.85it/s][A
|
|||
|
|
[A
Merging models: 17%|█▋ | 76/435 [00:20<01:29, 4.01it/s]
|
|||
|
|
Processing model.layers.4.self_attn.k_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.4.self_attn.k_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 9.28it/s][A
|
|||
|
|
[A
Merging models: 18%|█▊ | 77/435 [00:21<01:24, 4.25it/s]
|
|||
|
|
Processing model.layers.1.mlp.up_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.1.mlp.up_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 3.76it/s][A
|
|||
|
|
Processing model.layers.1.mlp.up_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.11it/s][A
|
|||
|
|
[A
Merging models: 18%|█▊ | 78/435 [00:21<01:51, 3.19it/s]
|
|||
|
|
Processing model.layers.19.self_attn.v_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.19.self_attn.v_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 10.60it/s][A
|
|||
|
|
[A
Merging models: 18%|█▊ | 79/435 [00:21<01:38, 3.62it/s]
|
|||
|
|
Processing model.layers.16.input_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.16.input_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 11.10it/s][A
|
|||
|
|
[A
Merging models: 18%|█▊ | 80/435 [00:21<01:27, 4.04it/s]
|
|||
|
|
Processing model.layers.31.mlp.up_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.31.mlp.up_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 4.86it/s][A
|
|||
|
|
Processing model.layers.31.mlp.up_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.58it/s][A
|
|||
|
|
[A
Merging models: 19%|█▊ | 81/435 [00:22<01:47, 3.30it/s]
|
|||
|
|
Processing model.layers.17.post_attention_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.17.post_attention_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 11.10it/s][A
|
|||
|
|
[A
Merging models: 19%|█▉ | 82/435 [00:22<01:34, 3.75it/s]
|
|||
|
|
Processing model.layers.22.self_attn.k_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.22.self_attn.k_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 10.65it/s][A
|
|||
|
|
[A
Merging models: 19%|█▉ | 83/435 [00:22<01:25, 4.12it/s]
|
|||
|
|
Processing model.layers.29.post_attention_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.29.post_attention_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 11.10it/s][A
|
|||
|
|
[A
Merging models: 19%|█▉ | 84/435 [00:22<01:18, 4.46it/s]
|
|||
|
|
Processing model.layers.33.self_attn.o_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.33.self_attn.o_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 9.01it/s][A
|
|||
|
|
Processing model.layers.33.self_attn.o_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 8.00it/s][A
|
|||
|
|
[A
Merging models: 20%|█▉ | 85/435 [00:23<01:20, 4.33it/s]
|
|||
|
|
Processing model.layers.7.self_attn.o_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.7.self_attn.o_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 6.52it/s][A
|
|||
|
|
Processing model.layers.7.self_attn.o_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 6.86it/s][A
|
|||
|
|
[A
Merging models: 20%|█▉ | 86/435 [00:23<01:27, 4.00it/s]
|
|||
|
|
Processing model.layers.32.mlp.up_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.32.mlp.up_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 4.83it/s][A
|
|||
|
|
Processing model.layers.32.mlp.up_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.62it/s][A
|
|||
|
|
[A
Merging models: 20%|██ | 87/435 [00:23<01:45, 3.29it/s]
|
|||
|
|
Processing model.layers.30.mlp.up_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.30.mlp.up_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 4.99it/s][A
|
|||
|
|
Processing model.layers.30.mlp.up_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.73it/s][A
|
|||
|
|
[A
Merging models: 20%|██ | 88/435 [00:24<01:57, 2.95it/s]
|
|||
|
|
Processing model.layers.32.mlp.gate_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.32.mlp.gate_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 4.99it/s][A
|
|||
|
|
Processing model.layers.32.mlp.gate_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.68it/s][A
|
|||
|
|
[A
Merging models: 20%|██ | 89/435 [00:24<02:06, 2.75it/s]
|
|||
|
|
Processing model.layers.33.self_attn.v_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.33.self_attn.v_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 9.96it/s][A
|
|||
|
|
[A
Merging models: 21%|██ | 90/435 [00:24<01:48, 3.18it/s]
|
|||
|
|
Processing model.layers.21.self_attn.k_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.21.self_attn.k_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 10.15it/s][A
|
|||
|
|
[A
Merging models: 21%|██ | 91/435 [00:25<01:36, 3.58it/s]
|
|||
|
|
Processing model.layers.26.self_attn.v_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.26.self_attn.v_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 10.18it/s][A
|
|||
|
|
[A
Merging models: 21%|██ | 92/435 [00:25<01:27, 3.93it/s]
|
|||
|
|
Processing model.layers.1.self_attn.q_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.1.self_attn.q_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 5.07it/s][A
|
|||
|
|
Processing model.layers.1.self_attn.q_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 6.19it/s][A
|
|||
|
|
[A
Merging models: 21%|██▏ | 93/435 [00:25<01:35, 3.59it/s]
|
|||
|
|
Processing model.layers.35.self_attn.k_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.35.self_attn.k_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 10.64it/s][A
|
|||
|
|
[A
Merging models: 22%|██▏ | 94/435 [00:25<01:25, 3.98it/s]
|
|||
|
|
Processing model.layers.34.self_attn.o_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.34.self_attn.o_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 7.83it/s][A
|
|||
|
|
Processing model.layers.34.self_attn.o_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.54it/s][A
|
|||
|
|
[A
Merging models: 22%|██▏ | 95/435 [00:26<01:26, 3.92it/s]
|
|||
|
|
Processing model.layers.24.mlp.gate_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.24.mlp.gate_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 4.87it/s][A
|
|||
|
|
Processing model.layers.24.mlp.gate_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.61it/s][A
|
|||
|
|
[A
Merging models: 22%|██▏ | 96/435 [00:26<01:44, 3.25it/s]
|
|||
|
|
Processing model.layers.19.post_attention_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.19.post_attention_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 11.12it/s][A
|
|||
|
|
[A
Merging models: 22%|██▏ | 97/435 [00:26<01:31, 3.71it/s]
|
|||
|
|
Processing model.layers.25.self_attn.q_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.25.self_attn.q_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 8.24it/s][A
|
|||
|
|
Processing model.layers.25.self_attn.q_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.63it/s][A
|
|||
|
|
[A
Merging models: 23%|██▎ | 98/435 [00:26<01:29, 3.75it/s]
|
|||
|
|
Processing model.layers.12.self_attn.o_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.12.self_attn.o_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 6.87it/s][A
|
|||
|
|
Processing model.layers.12.self_attn.o_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.14it/s][A
|
|||
|
|
[A
Merging models: 23%|██▎ | 99/435 [00:27<01:31, 3.69it/s]
|
|||
|
|
Processing model.layers.31.mlp.down_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.31.mlp.down_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 4.82it/s][A
|
|||
|
|
Processing model.layers.31.mlp.down_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.59it/s][A
|
|||
|
|
[A
Merging models: 23%|██▎ | 100/435 [00:27<01:47, 3.13it/s]
|
|||
|
|
Processing model.layers.37.post_attention_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.37.post_attention_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 11.11it/s][A
|
|||
|
|
[A
Merging models: 23%|██▎ | 101/435 [00:27<01:32, 3.60it/s]
|
|||
|
|
Processing model.layers.27.self_attn.q_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.27.self_attn.q_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 7.69it/s][A
|
|||
|
|
Processing model.layers.27.self_attn.q_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.40it/s][A
|
|||
|
|
[A
Merging models: 23%|██▎ | 102/435 [00:28<01:31, 3.63it/s]
|
|||
|
|
Processing model.layers.28.input_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.28.input_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 11.12it/s][A
|
|||
|
|
[A
Merging models: 24%|██▎ | 103/435 [00:28<01:21, 4.05it/s]
|
|||
|
|
Processing model.layers.17.mlp.down_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.17.mlp.down_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 4.78it/s][A
|
|||
|
|
Processing model.layers.17.mlp.down_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.60it/s][A
|
|||
|
|
[A
Merging models: 24%|██▍ | 104/435 [00:28<01:40, 3.30it/s]
|
|||
|
|
Processing model.layers.7.post_attention_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.7.post_attention_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 10.96it/s][A
|
|||
|
|
[A
Merging models: 24%|██▍ | 105/435 [00:28<01:28, 3.75it/s]
|
|||
|
|
Processing model.layers.4.mlp.gate_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.4.mlp.gate_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 3.82it/s][A
|
|||
|
|
Processing model.layers.4.mlp.gate_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.18it/s][A
|
|||
|
|
[A
Merging models: 24%|██▍ | 106/435 [00:29<01:49, 3.01it/s]
|
|||
|
|
Processing model.layers.18.input_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.18.input_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 11.13it/s][A
|
|||
|
|
[A
Merging models: 25%|██▍ | 107/435 [00:29<01:34, 3.49it/s]
|
|||
|
|
Processing model.layers.10.self_attn.o_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.10.self_attn.o_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 7.17it/s][A
|
|||
|
|
Processing model.layers.10.self_attn.o_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.26it/s][A
|
|||
|
|
[A
Merging models: 25%|██▍ | 108/435 [00:29<01:32, 3.53it/s]
|
|||
|
|
Processing model.layers.6.self_attn.v_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.6.self_attn.v_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 9.82it/s][A
|
|||
|
|
[A
Merging models: 25%|██▌ | 109/435 [00:30<01:24, 3.88it/s]
|
|||
|
|
Processing model.layers.2.self_attn.q_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.2.self_attn.q_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 7.08it/s][A
|
|||
|
|
Processing model.layers.2.self_attn.q_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.17it/s][A
|
|||
|
|
[A
Merging models: 25%|██▌ | 110/435 [00:30<01:25, 3.78it/s]
|
|||
|
|
Processing model.layers.22.self_attn.o_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.22.self_attn.o_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 8.29it/s][A
|
|||
|
|
Processing model.layers.22.self_attn.o_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.72it/s][A
|
|||
|
|
[A
Merging models: 26%|██▌ | 111/435 [00:30<01:24, 3.81it/s]
|
|||
|
|
Processing model.layers.36.mlp.gate_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.36.mlp.gate_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 4.83it/s][A
|
|||
|
|
Processing model.layers.36.mlp.gate_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.68it/s][A
|
|||
|
|
[A
Merging models: 26%|██▌ | 112/435 [00:31<01:40, 3.21it/s]
|
|||
|
|
Processing model.layers.7.self_attn.k_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.7.self_attn.k_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 9.82it/s][A
|
|||
|
|
[A
Merging models: 26%|██▌ | 113/435 [00:31<01:29, 3.61it/s]
|
|||
|
|
Processing model.layers.4.post_attention_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.4.post_attention_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 10.69it/s][A
|
|||
|
|
[A
Merging models: 26%|██▌ | 114/435 [00:31<01:20, 4.00it/s]
|
|||
|
|
Processing model.layers.31.self_attn.v_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.31.self_attn.v_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 10.35it/s][A
|
|||
|
|
[A
Merging models: 26%|██▋ | 115/435 [00:31<01:14, 4.29it/s]
|
|||
|
|
Processing model.layers.1.mlp.down_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.1.mlp.down_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 3.06it/s][A
|
|||
|
|
Processing model.layers.1.mlp.down_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 3.77it/s][A
|
|||
|
|
[A
Merging models: 27%|██▋ | 116/435 [00:32<01:44, 3.05it/s]
|
|||
|
|
Processing model.layers.35.mlp.gate_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.35.mlp.gate_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 5.03it/s][A
|
|||
|
|
Processing model.layers.35.mlp.gate_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.73it/s][A
|
|||
|
|
[A
Merging models: 27%|██▋ | 117/435 [00:32<01:53, 2.81it/s]
|
|||
|
|
Processing model.layers.29.mlp.gate_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.29.mlp.gate_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 5.00it/s][A
|
|||
|
|
Processing model.layers.29.mlp.gate_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.75it/s][A
|
|||
|
|
[A
Merging models: 27%|██▋ | 118/435 [00:33<01:58, 2.67it/s]
|
|||
|
|
Processing model.layers.20.self_attn.v_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.20.self_attn.v_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 10.62it/s][A
|
|||
|
|
[A
Merging models: 27%|██▋ | 119/435 [00:33<01:40, 3.14it/s]
|
|||
|
|
Processing model.layers.20.input_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.20.input_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 11.15it/s][A
|
|||
|
|
[A
Merging models: 28%|██▊ | 120/435 [00:33<01:27, 3.61it/s]
|
|||
|
|
Processing model.layers.22.mlp.down_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.22.mlp.down_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 5.00it/s][A
|
|||
|
|
Processing model.layers.22.mlp.down_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.73it/s][A
|
|||
|
|
[A
Merging models: 28%|██▊ | 121/435 [00:33<01:40, 3.13it/s]
|
|||
|
|
Processing model.layers.46.self_attn.v_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.46.self_attn.v_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 10.60it/s][A
|
|||
|
|
[A
Merging models: 28%|██▊ | 122/435 [00:33<01:27, 3.56it/s]
|
|||
|
|
Processing model.layers.23.self_attn.k_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.23.self_attn.k_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 10.55it/s][A
|
|||
|
|
[A
Merging models: 28%|██▊ | 123/435 [00:34<01:19, 3.95it/s]
|
|||
|
|
Processing model.layers.40.mlp.gate_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.40.mlp.gate_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 5.16it/s][A
|
|||
|
|
Processing model.layers.40.mlp.gate_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.82it/s][A
|
|||
|
|
[A
Merging models: 29%|██▊ | 124/435 [00:34<01:33, 3.32it/s]
|
|||
|
|
Processing model.layers.36.self_attn.o_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.36.self_attn.o_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 8.41it/s][A
|
|||
|
|
Processing model.layers.36.self_attn.o_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.73it/s][A
|
|||
|
|
[A
Merging models: 29%|██▊ | 125/435 [00:34<01:29, 3.48it/s]
|
|||
|
|
Processing model.layers.25.self_attn.k_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.25.self_attn.k_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 9.92it/s][A
|
|||
|
|
[A
Merging models: 29%|██▉ | 126/435 [00:35<01:20, 3.83it/s]
|
|||
|
|
Processing model.layers.19.mlp.gate_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.19.mlp.gate_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 5.15it/s][A
|
|||
|
|
Processing model.layers.19.mlp.gate_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.88it/s][A
|
|||
|
|
[A
Merging models: 29%|██▉ | 127/435 [00:35<01:33, 3.28it/s]
|
|||
|
|
Processing model.layers.6.input_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.6.input_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 10.90it/s][A
|
|||
|
|
[A
Merging models: 29%|██▉ | 128/435 [00:35<01:22, 3.72it/s]
|
|||
|
|
Processing model.layers.21.self_attn.q_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.21.self_attn.q_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 7.79it/s][A
|
|||
|
|
Processing model.layers.21.self_attn.q_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.49it/s][A
|
|||
|
|
[A
Merging models: 30%|██▉ | 129/435 [00:35<01:21, 3.73it/s]
|
|||
|
|
Processing model.layers.0.mlp.down_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.0.mlp.down_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 3.81it/s][A
|
|||
|
|
Processing model.layers.0.mlp.down_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.22it/s][A
|
|||
|
|
[A
Merging models: 30%|██▉ | 130/435 [00:36<01:41, 3.01it/s]
|
|||
|
|
Processing model.layers.37.self_attn.q_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.37.self_attn.q_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 8.29it/s][A
|
|||
|
|
Processing model.layers.37.self_attn.q_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.71it/s][A
|
|||
|
|
[A
Merging models: 30%|███ | 131/435 [00:36<01:34, 3.23it/s]
|
|||
|
|
Processing model.layers.47.self_attn.q_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.47.self_attn.q_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 7.73it/s][A
|
|||
|
|
Processing model.layers.47.self_attn.q_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.41it/s][A
|
|||
|
|
[A
Merging models: 30%|███ | 132/435 [00:36<01:30, 3.36it/s]
|
|||
|
|
Processing model.layers.8.self_attn.v_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.8.self_attn.v_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 9.80it/s][A
|
|||
|
|
[A
Merging models: 31%|███ | 133/435 [00:37<01:21, 3.73it/s]
|
|||
|
|
Processing model.layers.37.self_attn.k_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.37.self_attn.k_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 10.54it/s][A
|
|||
|
|
[A
Merging models: 31%|███ | 134/435 [00:37<01:13, 4.08it/s]
|
|||
|
|
Processing model.layers.42.post_attention_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.42.post_attention_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 11.12it/s][A
|
|||
|
|
[A
Merging models: 31%|███ | 135/435 [00:37<01:07, 4.43it/s]
|
|||
|
|
Processing model.layers.0.self_attn.o_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.0.self_attn.o_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 7.26it/s][A
|
|||
|
|
Processing model.layers.0.self_attn.o_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.31it/s][A
|
|||
|
|
[A
Merging models: 31%|███▏ | 136/435 [00:37<01:11, 4.16it/s]
|
|||
|
|
Processing model.layers.40.self_attn.k_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.40.self_attn.k_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 9.84it/s][A
|
|||
|
|
[A
Merging models: 31%|███▏ | 137/435 [00:37<01:08, 4.38it/s]
|
|||
|
|
Processing model.layers.38.self_attn.o_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.38.self_attn.o_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 8.39it/s][A
|
|||
|
|
Processing model.layers.38.self_attn.o_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.75it/s][A
|
|||
|
|
[A
Merging models: 32%|███▏ | 138/435 [00:38<01:10, 4.23it/s]
|
|||
|
|
Processing model.layers.43.mlp.up_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.43.mlp.up_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 5.08it/s][A
|
|||
|
|
Processing model.layers.43.mlp.up_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.84it/s][A
|
|||
|
|
[A
Merging models: 32%|███▏ | 139/435 [00:38<01:25, 3.46it/s]
|
|||
|
|
Processing model.layers.21.input_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.21.input_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 10.98it/s][A
|
|||
|
|
[A
Merging models: 32%|███▏ | 140/435 [00:38<01:15, 3.89it/s]
|
|||
|
|
Processing model.layers.5.self_attn.k_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.5.self_attn.k_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 9.72it/s][A
|
|||
|
|
[A
Merging models: 32%|███▏ | 141/435 [00:39<01:10, 4.17it/s]
|
|||
|
|
Processing model.layers.41.mlp.gate_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.41.mlp.gate_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 5.17it/s][A
|
|||
|
|
Processing model.layers.41.mlp.gate_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.84it/s][A
|
|||
|
|
[A
Merging models: 33%|███▎ | 142/435 [00:39<01:25, 3.44it/s]
|
|||
|
|
Processing model.layers.1.mlp.gate_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.1.mlp.gate_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 3.87it/s][A
|
|||
|
|
Processing model.layers.1.mlp.gate_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.25it/s][A
|
|||
|
|
[A
Merging models: 33%|███▎ | 143/435 [00:39<01:41, 2.88it/s]
|
|||
|
|
Processing model.layers.25.mlp.down_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.25.mlp.down_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 5.09it/s][A
|
|||
|
|
Processing model.layers.25.mlp.down_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.81it/s][A
|
|||
|
|
[A
Merging models: 33%|███▎ | 144/435 [00:40<01:46, 2.73it/s]
|
|||
|
|
Processing model.layers.26.input_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.26.input_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 10.66it/s][A
|
|||
|
|
[A
Merging models: 33%|███▎ | 145/435 [00:40<01:30, 3.19it/s]
|
|||
|
|
Processing model.layers.33.post_attention_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.33.post_attention_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 10.91it/s][A
|
|||
|
|
[A
Merging models: 34%|███▎ | 146/435 [00:40<01:19, 3.65it/s]
|
|||
|
|
Processing model.layers.44.mlp.gate_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.44.mlp.gate_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 5.14it/s][A
|
|||
|
|
Processing model.layers.44.mlp.gate_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.85it/s][A
|
|||
|
|
[A
Merging models: 34%|███▍ | 147/435 [00:41<01:30, 3.18it/s]
|
|||
|
|
Processing model.layers.31.self_attn.q_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.31.self_attn.q_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 8.36it/s][A
|
|||
|
|
Processing model.layers.31.self_attn.q_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.71it/s][A
|
|||
|
|
[A
Merging models: 34%|███▍ | 148/435 [00:41<01:25, 3.36it/s]
|
|||
|
|
Processing model.layers.11.post_attention_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.11.post_attention_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 10.92it/s][A
|
|||
|
|
[A
Merging models: 34%|███▍ | 149/435 [00:41<01:15, 3.80it/s]
|
|||
|
|
Processing model.layers.38.self_attn.k_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.38.self_attn.k_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 10.44it/s][A
|
|||
|
|
[A
Merging models: 34%|███▍ | 150/435 [00:41<01:08, 4.13it/s]
|
|||
|
|
Processing model.layers.42.mlp.up_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.42.mlp.up_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 5.06it/s][A
|
|||
|
|
Processing model.layers.42.mlp.up_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.77it/s][A
|
|||
|
|
[A
Merging models: 35%|███▍ | 151/435 [00:42<01:23, 3.40it/s]
|
|||
|
|
Processing model.layers.35.self_attn.o_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.35.self_attn.o_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 8.37it/s][A
|
|||
|
|
Processing model.layers.35.self_attn.o_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.70it/s][A
|
|||
|
|
[A
Merging models: 35%|███▍ | 152/435 [00:42<01:20, 3.53it/s]
|
|||
|
|
Processing model.layers.2.self_attn.o_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.2.self_attn.o_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 6.97it/s][A
|
|||
|
|
Processing model.layers.2.self_attn.o_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.15it/s][A
|
|||
|
|
[A
Merging models: 35%|███▌ | 153/435 [00:42<01:19, 3.54it/s]
|
|||
|
|
Processing model.layers.25.self_attn.v_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.25.self_attn.v_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 10.57it/s][A
|
|||
|
|
[A
Merging models: 35%|███▌ | 154/435 [00:42<01:11, 3.93it/s]
|
|||
|
|
Processing model.layers.45.mlp.gate_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.45.mlp.gate_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 5.10it/s][A
|
|||
|
|
Processing model.layers.45.mlp.gate_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.84it/s][A
|
|||
|
|
[A
Merging models: 36%|███▌ | 155/435 [00:43<01:24, 3.32it/s]
|
|||
|
|
Processing model.layers.13.mlp.gate_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.13.mlp.gate_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 3.92it/s][A
|
|||
|
|
Processing model.layers.13.mlp.gate_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.28it/s][A
|
|||
|
|
[A
Merging models: 36%|███▌ | 156/435 [00:43<01:38, 2.83it/s]
|
|||
|
|
Processing model.layers.47.input_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.47.input_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 11.20it/s][A
|
|||
|
|
[A
Merging models: 36%|███▌ | 157/435 [00:43<01:23, 3.32it/s]
|
|||
|
|
Processing model.layers.37.input_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.37.input_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 11.22it/s][A
|
|||
|
|
[A
Merging models: 36%|███▋ | 158/435 [00:44<01:13, 3.78it/s]
|
|||
|
|
Processing model.layers.45.self_attn.k_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.45.self_attn.k_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 10.51it/s][A
|
|||
|
|
[A
Merging models: 37%|███▋ | 159/435 [00:44<01:06, 4.13it/s]
|
|||
|
|
Processing model.layers.38.mlp.down_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.38.mlp.down_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 5.10it/s][A
|
|||
|
|
Processing model.layers.38.mlp.down_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.81it/s][A
|
|||
|
|
[A
Merging models: 37%|███▋ | 160/435 [00:44<01:20, 3.41it/s]
|
|||
|
|
Processing model.layers.4.mlp.up_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.4.mlp.up_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 3.91it/s][A
|
|||
|
|
Processing model.layers.4.mlp.up_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.15it/s][A
|
|||
|
|
[A
Merging models: 37%|███▋ | 161/435 [00:45<01:36, 2.85it/s]
|
|||
|
|
Processing model.layers.29.mlp.up_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.29.mlp.up_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 5.14it/s][A
|
|||
|
|
Processing model.layers.29.mlp.up_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.86it/s][A
|
|||
|
|
[A
Merging models: 37%|███▋ | 162/435 [00:45<01:40, 2.71it/s]
|
|||
|
|
Processing model.layers.20.post_attention_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.20.post_attention_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 10.84it/s][A
|
|||
|
|
[A
Merging models: 37%|███▋ | 163/435 [00:45<01:25, 3.19it/s]
|
|||
|
|
Processing model.layers.16.mlp.gate_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.16.mlp.gate_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 5.16it/s][A
|
|||
|
|
Processing model.layers.16.mlp.gate_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.84it/s][A
|
|||
|
|
[A
Merging models: 38%|███▊ | 164/435 [00:46<01:32, 2.92it/s]
|
|||
|
|
Processing model.layers.23.mlp.up_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.23.mlp.up_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 5.15it/s][A
|
|||
|
|
Processing model.layers.23.mlp.up_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.82it/s][A
|
|||
|
|
[A
Merging models: 38%|███▊ | 165/435 [00:46<01:38, 2.76it/s]
|
|||
|
|
Processing model.layers.6.mlp.gate_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.6.mlp.gate_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 3.92it/s][A
|
|||
|
|
Processing model.layers.6.mlp.gate_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.35it/s][A
|
|||
|
|
[A
Merging models: 38%|███▊ | 166/435 [00:47<01:46, 2.54it/s]
|
|||
|
|
Processing model.layers.29.input_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.29.input_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 11.23it/s][A
|
|||
|
|
[A
Merging models: 38%|███▊ | 167/435 [00:47<01:28, 3.03it/s]
|
|||
|
|
Processing model.layers.46.self_attn.q_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.46.self_attn.q_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 8.37it/s][A
|
|||
|
|
Processing model.layers.46.self_attn.q_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.85it/s][A
|
|||
|
|
[A
Merging models: 39%|███▊ | 168/435 [00:47<01:21, 3.26it/s]
|
|||
|
|
Processing model.layers.42.input_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.42.input_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 11.12it/s][A
|
|||
|
|
[A
Merging models: 39%|███▉ | 169/435 [00:47<01:11, 3.72it/s]
|
|||
|
|
Processing model.layers.36.mlp.down_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.36.mlp.down_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 4.90it/s][A
|
|||
|
|
Processing model.layers.36.mlp.down_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.78it/s][A
|
|||
|
|
[A
Merging models: 39%|███▉ | 170/435 [00:48<01:23, 3.19it/s]
|
|||
|
|
Processing model.layers.36.self_attn.q_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.36.self_attn.q_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 8.45it/s][A
|
|||
|
|
Processing model.layers.36.self_attn.q_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.79it/s][A
|
|||
|
|
[A
Merging models: 39%|███▉ | 171/435 [00:48<01:18, 3.38it/s]
|
|||
|
|
Processing model.layers.12.input_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.12.input_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 11.03it/s][A
|
|||
|
|
[A
Merging models: 40%|███▉ | 172/435 [00:48<01:08, 3.82it/s]
|
|||
|
|
Processing model.layers.12.self_attn.k_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.12.self_attn.k_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 9.94it/s][A
|
|||
|
|
[A
Merging models: 40%|███▉ | 173/435 [00:48<01:03, 4.13it/s]
|
|||
|
|
Processing model.layers.18.mlp.down_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.18.mlp.down_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 5.08it/s][A
|
|||
|
|
Processing model.layers.18.mlp.down_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.83it/s][A
|
|||
|
|
[A
Merging models: 40%|████ | 174/435 [00:49<01:16, 3.41it/s]
|
|||
|
|
Processing model.layers.26.post_attention_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.26.post_attention_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 11.23it/s][A
|
|||
|
|
[A
Merging models: 40%|████ | 175/435 [00:49<01:07, 3.87it/s]
|
|||
|
|
Processing model.embed_tokens.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.embed_tokens.weight: 50%|█████ | 1/2 [00:00<00:00, 2.30it/s][A
|
|||
|
|
Processing model.embed_tokens.weight: 100%|██████████| 2/2 [00:00<00:00, 2.57it/s][A
|
|||
|
|
[A
Merging models: 40%|████ | 176/435 [00:50<01:48, 2.39it/s]
|
|||
|
|
Processing model.layers.13.self_attn.v_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.13.self_attn.v_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 9.88it/s][A
|
|||
|
|
[A
Merging models: 41%|████ | 177/435 [00:50<01:30, 2.84it/s]
|
|||
|
|
Processing model.layers.10.mlp.down_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.10.mlp.down_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 4.00it/s][A
|
|||
|
|
Processing model.layers.10.mlp.down_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.36it/s][A
|
|||
|
|
[A
Merging models: 41%|████ | 178/435 [00:50<01:39, 2.59it/s]
|
|||
|
|
Processing model.layers.46.self_attn.o_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.46.self_attn.o_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 8.38it/s][A
|
|||
|
|
Processing model.layers.46.self_attn.o_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.91it/s][A
|
|||
|
|
[A
Merging models: 41%|████ | 179/435 [00:51<01:28, 2.89it/s]
|
|||
|
|
Processing model.layers.41.input_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.41.input_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 11.15it/s][A
|
|||
|
|
[A
Merging models: 41%|████▏ | 180/435 [00:51<01:15, 3.38it/s]
|
|||
|
|
Processing model.layers.42.self_attn.v_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.42.self_attn.v_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 10.69it/s][A
|
|||
|
|
[A
Merging models: 42%|████▏ | 181/435 [00:51<01:06, 3.80it/s]
|
|||
|
|
Processing model.layers.32.post_attention_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.32.post_attention_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 11.12it/s][A
|
|||
|
|
[A
Merging models: 42%|████▏ | 182/435 [00:51<01:00, 4.19it/s]
|
|||
|
|
Processing model.layers.0.self_attn.q_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.0.self_attn.q_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 6.96it/s][A
|
|||
|
|
Processing model.layers.0.self_attn.q_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.27it/s][A
|
|||
|
|
[A
Merging models: 42%|████▏ | 183/435 [00:51<01:03, 4.00it/s]
|
|||
|
|
Processing model.layers.33.self_attn.k_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.33.self_attn.k_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 10.58it/s][A
|
|||
|
|
[A
Merging models: 42%|████▏ | 184/435 [00:52<00:58, 4.31it/s]
|
|||
|
|
Processing model.layers.29.mlp.down_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.29.mlp.down_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 5.12it/s][A
|
|||
|
|
Processing model.layers.29.mlp.down_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.85it/s][A
|
|||
|
|
[A
Merging models: 43%|████▎ | 185/435 [00:52<01:11, 3.51it/s]
|
|||
|
|
Processing model.layers.30.mlp.down_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.30.mlp.down_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 5.10it/s][A
|
|||
|
|
Processing model.layers.30.mlp.down_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.80it/s][A
|
|||
|
|
[A
Merging models: 43%|████▎ | 186/435 [00:52<01:20, 3.09it/s]
|
|||
|
|
Processing model.layers.33.mlp.down_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.33.mlp.down_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 5.07it/s][A
|
|||
|
|
Processing model.layers.33.mlp.down_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.81it/s][A
|
|||
|
|
[A
Merging models: 43%|████▎ | 187/435 [00:53<01:26, 2.85it/s]
|
|||
|
|
Processing model.layers.12.mlp.gate_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.12.mlp.gate_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 3.87it/s][A
|
|||
|
|
Processing model.layers.12.mlp.gate_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.28it/s][A
|
|||
|
|
[A
Merging models: 43%|████▎ | 188/435 [00:53<01:35, 2.58it/s]
|
|||
|
|
Processing model.layers.27.post_attention_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.27.post_attention_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 11.11it/s][A
|
|||
|
|
[A
Merging models: 43%|████▎ | 189/435 [00:53<01:20, 3.07it/s]
|
|||
|
|
Processing model.layers.34.self_attn.q_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.34.self_attn.q_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 8.35it/s][A
|
|||
|
|
Processing model.layers.34.self_attn.q_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.75it/s][A
|
|||
|
|
[A
Merging models: 44%|████▎ | 190/435 [00:54<01:14, 3.28it/s]
|
|||
|
|
Processing model.layers.22.mlp.gate_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.22.mlp.gate_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 4.96it/s][A
|
|||
|
|
Processing model.layers.22.mlp.gate_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.77it/s][A
|
|||
|
|
[A
Merging models: 44%|████▍ | 191/435 [00:54<01:22, 2.95it/s]
|
|||
|
|
Processing model.layers.28.mlp.gate_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.28.mlp.gate_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 5.08it/s][A
|
|||
|
|
Processing model.layers.28.mlp.gate_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.81it/s][A
|
|||
|
|
[A
Merging models: 44%|████▍ | 192/435 [00:55<01:27, 2.77it/s]
|
|||
|
|
Processing model.layers.42.self_attn.q_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.42.self_attn.q_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 8.25it/s][A
|
|||
|
|
Processing model.layers.42.self_attn.q_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.77it/s][A
|
|||
|
|
[A
Merging models: 44%|████▍ | 193/435 [00:55<01:19, 3.04it/s]
|
|||
|
|
Processing model.layers.29.self_attn.v_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.29.self_attn.v_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 10.43it/s][A
|
|||
|
|
[A
Merging models: 45%|████▍ | 194/435 [00:55<01:09, 3.47it/s]
|
|||
|
|
Processing model.layers.47.post_attention_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.47.post_attention_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 11.12it/s][A
|
|||
|
|
[A
Merging models: 45%|████▍ | 195/435 [00:55<01:01, 3.91it/s]
|
|||
|
|
Processing model.layers.0.mlp.up_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.0.mlp.up_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 3.94it/s][A
|
|||
|
|
Processing model.layers.0.mlp.up_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.27it/s][A
|
|||
|
|
[A
Merging models: 45%|████▌ | 196/435 [00:56<01:16, 3.11it/s]
|
|||
|
|
Processing model.layers.27.mlp.gate_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.27.mlp.gate_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 5.11it/s][A
|
|||
|
|
Processing model.layers.27.mlp.gate_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.83it/s][A
|
|||
|
|
[A
Merging models: 45%|████▌ | 197/435 [00:56<01:22, 2.87it/s]
|
|||
|
|
Processing model.layers.9.self_attn.v_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.9.self_attn.v_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 9.66it/s][A
|
|||
|
|
[A
Merging models: 46%|████▌ | 198/435 [00:56<01:12, 3.29it/s]
|
|||
|
|
Processing model.layers.40.mlp.down_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.40.mlp.down_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 5.12it/s][A
|
|||
|
|
Processing model.layers.40.mlp.down_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.83it/s][A
|
|||
|
|
[A
Merging models: 46%|████▌ | 199/435 [00:57<01:19, 2.98it/s]
|
|||
|
|
Processing model.layers.17.self_attn.q_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.17.self_attn.q_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 8.46it/s][A
|
|||
|
|
Processing model.layers.17.self_attn.q_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.81it/s][A
|
|||
|
|
[A
Merging models: 46%|████▌ | 200/435 [00:57<01:13, 3.21it/s]
|
|||
|
|
Processing model.layers.22.self_attn.v_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.22.self_attn.v_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 10.51it/s][A
|
|||
|
|
[A
Merging models: 46%|████▌ | 201/435 [00:57<01:04, 3.63it/s]
|
|||
|
|
Processing model.layers.9.post_attention_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.9.post_attention_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 11.08it/s][A
|
|||
|
|
[A
Merging models: 46%|████▋ | 202/435 [00:57<00:57, 4.05it/s]
|
|||
|
|
Processing model.layers.38.mlp.gate_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.38.mlp.gate_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 5.00it/s][A
|
|||
|
|
Processing model.layers.38.mlp.gate_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.80it/s][A
|
|||
|
|
[A
Merging models: 47%|████▋ | 203/435 [00:58<01:08, 3.36it/s]
|
|||
|
|
Processing model.layers.16.mlp.up_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.16.mlp.up_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 5.10it/s][A
|
|||
|
|
Processing model.layers.16.mlp.up_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.84it/s][A
|
|||
|
|
[A
Merging models: 47%|████▋ | 204/435 [00:58<01:16, 3.02it/s]
|
|||
|
|
Processing model.layers.40.post_attention_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.40.post_attention_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 11.12it/s][A
|
|||
|
|
[A
Merging models: 47%|████▋ | 205/435 [00:58<01:05, 3.50it/s]
|
|||
|
|
Processing model.layers.45.self_attn.o_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.45.self_attn.o_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 8.40it/s][A
|
|||
|
|
Processing model.layers.45.self_attn.o_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.80it/s][A
|
|||
|
|
[A
Merging models: 47%|████▋ | 206/435 [00:59<01:03, 3.62it/s]
|
|||
|
|
Processing model.layers.35.self_attn.v_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.35.self_attn.v_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 10.49it/s][A
|
|||
|
|
[A
Merging models: 48%|████▊ | 207/435 [00:59<00:57, 3.99it/s]
|
|||
|
|
Processing model.layers.43.self_attn.o_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.43.self_attn.o_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 8.39it/s][A
|
|||
|
|
Processing model.layers.43.self_attn.o_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.76it/s][A
|
|||
|
|
[A
Merging models: 48%|████▊ | 208/435 [00:59<00:57, 3.97it/s]
|
|||
|
|
Processing model.layers.21.self_attn.v_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.21.self_attn.v_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 10.57it/s][A
|
|||
|
|
[A
Merging models: 48%|████▊ | 209/435 [00:59<00:52, 4.29it/s]
|
|||
|
|
Processing model.layers.47.mlp.down_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.47.mlp.down_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 5.05it/s][A
|
|||
|
|
Processing model.layers.47.mlp.down_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.81it/s][A
|
|||
|
|
[A
Merging models: 48%|████▊ | 210/435 [01:00<01:04, 3.48it/s]
|
|||
|
|
Processing model.layers.8.self_attn.q_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.8.self_attn.q_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 7.28it/s][A
|
|||
|
|
Processing model.layers.8.self_attn.q_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.31it/s][A
|
|||
|
|
[A
Merging models: 49%|████▊ | 211/435 [01:00<01:03, 3.53it/s]
|
|||
|
|
Processing model.layers.35.self_attn.q_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.35.self_attn.q_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 8.22it/s][A
|
|||
|
|
Processing model.layers.35.self_attn.q_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.71it/s][A
|
|||
|
|
[A
Merging models: 49%|████▊ | 212/435 [01:00<01:01, 3.63it/s]
|
|||
|
|
Processing model.layers.31.mlp.gate_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.31.mlp.gate_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 4.91it/s][A
|
|||
|
|
Processing model.layers.31.mlp.gate_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.76it/s][A
|
|||
|
|
[A
Merging models: 49%|████▉ | 213/435 [01:01<01:10, 3.14it/s]
|
|||
|
|
Processing model.layers.7.mlp.up_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.7.mlp.up_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 3.96it/s][A
|
|||
|
|
Processing model.layers.7.mlp.up_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.32it/s][A
|
|||
|
|
[A
Merging models: 49%|████▉ | 214/435 [01:01<01:20, 2.75it/s]
|
|||
|
|
Processing model.layers.15.mlp.down_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.15.mlp.down_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 5.04it/s][A
|
|||
|
|
Processing model.layers.15.mlp.down_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.78it/s][A
|
|||
|
|
[A
Merging models: 49%|████▉ | 215/435 [01:01<01:23, 2.64it/s]
|
|||
|
|
Processing model.layers.19.mlp.down_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.19.mlp.down_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 5.05it/s][A
|
|||
|
|
Processing model.layers.19.mlp.down_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.79it/s][A
|
|||
|
|
[A
Merging models: 50%|████▉ | 216/435 [01:02<01:25, 2.56it/s]
|
|||
|
|
Processing model.layers.20.self_attn.k_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.20.self_attn.k_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 10.48it/s][A
|
|||
|
|
[A
Merging models: 50%|████▉ | 217/435 [01:02<01:12, 3.03it/s]
|
|||
|
|
Processing model.layers.38.mlp.up_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.38.mlp.up_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 5.08it/s][A
|
|||
|
|
Processing model.layers.38.mlp.up_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.80it/s][A
|
|||
|
|
[A
Merging models: 50%|█████ | 218/435 [01:02<01:17, 2.81it/s]
|
|||
|
|
Processing model.layers.25.post_attention_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.25.post_attention_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 11.10it/s][A
|
|||
|
|
[A
Merging models: 50%|█████ | 219/435 [01:03<01:05, 3.30it/s]
|
|||
|
|
Processing model.layers.43.self_attn.k_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.43.self_attn.k_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 10.30it/s][A
|
|||
|
|
[A
Merging models: 51%|█████ | 220/435 [01:03<00:58, 3.70it/s]
|
|||
|
|
Processing model.layers.26.self_attn.k_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.26.self_attn.k_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 10.39it/s][A
|
|||
|
|
[A
Merging models: 51%|█████ | 221/435 [01:03<00:52, 4.05it/s]
|
|||
|
|
Processing model.layers.24.mlp.down_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.24.mlp.down_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 5.09it/s][A
|
|||
|
|
Processing model.layers.24.mlp.down_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.81it/s][A
|
|||
|
|
[A
Merging models: 51%|█████ | 222/435 [01:03<01:03, 3.37it/s]
|
|||
|
|
Processing model.layers.1.self_attn.o_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.1.self_attn.o_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 7.27it/s][A
|
|||
|
|
Processing model.layers.1.self_attn.o_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.47it/s][A
|
|||
|
|
[A
Merging models: 51%|█████▏ | 223/435 [01:04<01:01, 3.46it/s]
|
|||
|
|
Processing model.layers.24.self_attn.o_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.24.self_attn.o_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 7.84it/s][A
|
|||
|
|
Processing model.layers.24.self_attn.o_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.52it/s][A
|
|||
|
|
[A
Merging models: 51%|█████▏ | 224/435 [01:04<00:59, 3.55it/s]
|
|||
|
|
Processing model.layers.15.self_attn.q_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.15.self_attn.q_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 8.30it/s][A
|
|||
|
|
Processing model.layers.15.self_attn.q_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.69it/s][A
|
|||
|
|
[A
Merging models: 52%|█████▏ | 225/435 [01:04<00:57, 3.65it/s]
|
|||
|
|
Processing model.layers.2.mlp.up_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.2.mlp.up_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 3.89it/s][A
|
|||
|
|
Processing model.layers.2.mlp.up_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.28it/s][A
|
|||
|
|
[A
Merging models: 52%|█████▏ | 226/435 [01:05<01:09, 2.99it/s]
|
|||
|
|
Processing model.layers.10.self_attn.v_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.10.self_attn.v_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 9.83it/s][A
|
|||
|
|
[A
Merging models: 52%|█████▏ | 227/435 [01:05<01:01, 3.40it/s]
|
|||
|
|
Processing model.layers.11.input_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.11.input_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 10.83it/s][A
|
|||
|
|
[A
Merging models: 52%|█████▏ | 228/435 [01:05<00:54, 3.82it/s]
|
|||
|
|
Processing model.layers.40.input_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.40.input_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 10.91it/s][A
|
|||
|
|
[A
Merging models: 53%|█████▎ | 229/435 [01:05<00:49, 4.20it/s]
|
|||
|
|
Processing model.layers.31.self_attn.o_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.31.self_attn.o_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 8.28it/s][A
|
|||
|
|
Processing model.layers.31.self_attn.o_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.70it/s][A
|
|||
|
|
[A
Merging models: 53%|█████▎ | 230/435 [01:06<00:50, 4.10it/s]
|
|||
|
|
Processing model.layers.11.self_attn.v_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.11.self_attn.v_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 9.50it/s][A
|
|||
|
|
Processing model.layers.11.self_attn.v_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 9.78it/s][A
|
|||
|
|
[A
Merging models: 53%|█████▎ | 231/435 [01:06<00:47, 4.30it/s]
|
|||
|
|
Processing model.layers.32.self_attn.q_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.32.self_attn.q_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 8.35it/s][A
|
|||
|
|
Processing model.layers.32.self_attn.q_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.73it/s][A
|
|||
|
|
[A
Merging models: 53%|█████▎ | 232/435 [01:06<00:48, 4.17it/s]
|
|||
|
|
Processing model.layers.47.self_attn.k_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.47.self_attn.k_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 10.46it/s][A
|
|||
|
|
[A
Merging models: 54%|█████▎ | 233/435 [01:06<00:45, 4.44it/s]
|
|||
|
|
Processing model.layers.38.self_attn.v_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.38.self_attn.v_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 10.52it/s][A
|
|||
|
|
[A
Merging models: 54%|█████▍ | 234/435 [01:06<00:43, 4.65it/s]
|
|||
|
|
Processing model.layers.23.self_attn.v_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.23.self_attn.v_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 9.91it/s][A
|
|||
|
|
[A
Merging models: 54%|█████▍ | 235/435 [01:07<00:42, 4.74it/s]
|
|||
|
|
Processing model.layers.12.mlp.up_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.12.mlp.up_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 4.00it/s][A
|
|||
|
|
Processing model.layers.12.mlp.up_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.37it/s][A
|
|||
|
|
[A
Merging models: 54%|█████▍ | 236/435 [01:07<00:57, 3.48it/s]
|
|||
|
|
Processing model.layers.22.mlp.up_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.22.mlp.up_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 5.02it/s][A
|
|||
|
|
Processing model.layers.22.mlp.up_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.78it/s][A
|
|||
|
|
[A
Merging models: 54%|█████▍ | 237/435 [01:07<01:04, 3.07it/s]
|
|||
|
|
Processing model.layers.36.self_attn.v_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.36.self_attn.v_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 10.62it/s][A
|
|||
|
|
[A
Merging models: 55%|█████▍ | 238/435 [01:08<00:56, 3.51it/s]
|
|||
|
|
Processing model.layers.41.self_attn.v_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.41.self_attn.v_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 10.51it/s][A
|
|||
|
|
[A
Merging models: 55%|█████▍ | 239/435 [01:08<00:50, 3.90it/s]
|
|||
|
|
Processing model.layers.30.self_attn.v_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.30.self_attn.v_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 10.53it/s][A
|
|||
|
|
[A
Merging models: 55%|█████▌ | 240/435 [01:08<00:46, 4.22it/s]
|
|||
|
|
Processing model.layers.2.self_attn.k_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.2.self_attn.k_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 9.62it/s][A
|
|||
|
|
[A
Merging models: 55%|█████▌ | 241/435 [01:08<00:43, 4.43it/s]
|
|||
|
|
Processing model.layers.8.input_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.8.input_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 10.89it/s][A
|
|||
|
|
[A
Merging models: 56%|█████▌ | 242/435 [01:08<00:41, 4.69it/s]
|
|||
|
|
Processing model.layers.44.mlp.up_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.44.mlp.up_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 5.08it/s][A
|
|||
|
|
Processing model.layers.44.mlp.up_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.84it/s][A
|
|||
|
|
[A
Merging models: 56%|█████▌ | 243/435 [01:09<00:52, 3.67it/s]
|
|||
|
|
Processing model.layers.12.post_attention_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.12.post_attention_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 10.88it/s][A
|
|||
|
|
[A
Merging models: 56%|█████▌ | 244/435 [01:09<00:46, 4.07it/s]
|
|||
|
|
Processing model.layers.17.input_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.17.input_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 10.94it/s][A
|
|||
|
|
[A
Merging models: 56%|█████▋ | 245/435 [01:09<00:43, 4.40it/s]
|
|||
|
|
Processing model.layers.41.self_attn.o_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.41.self_attn.o_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 8.31it/s][A
|
|||
|
|
Processing model.layers.41.self_attn.o_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.80it/s][A
|
|||
|
|
[A
Merging models: 57%|█████▋ | 246/435 [01:09<00:44, 4.25it/s]
|
|||
|
|
Processing model.layers.8.mlp.up_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.8.mlp.up_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 3.92it/s][A
|
|||
|
|
Processing model.layers.8.mlp.up_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.30it/s][A
|
|||
|
|
[A
Merging models: 57%|█████▋ | 247/435 [01:10<00:57, 3.26it/s]
|
|||
|
|
Processing model.layers.39.self_attn.k_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.39.self_attn.k_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 10.40it/s][A
|
|||
|
|
[A
Merging models: 57%|█████▋ | 248/435 [01:10<00:50, 3.67it/s]
|
|||
|
|
Processing model.layers.21.post_attention_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.21.post_attention_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 9.00it/s][A
|
|||
|
|
[A
Merging models: 57%|█████▋ | 249/435 [01:10<00:47, 3.88it/s]
|
|||
|
|
Processing model.layers.24.input_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.24.input_layernorm.weight: 50%|█████ | 1/2 [00:00<00:00, 8.49it/s][A
|
|||
|
|
[A
Merging models: 57%|█████▋ | 250/435 [01:11<00:45, 4.07it/s]
|
|||
|
|
Processing model.layers.46.mlp.up_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.46.mlp.up_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 5.09it/s][A
|
|||
|
|
Processing model.layers.46.mlp.up_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.85it/s][A
|
|||
|
|
[A
Merging models: 58%|█████▊ | 251/435 [01:11<00:54, 3.39it/s]
|
|||
|
|
Processing model.layers.13.mlp.up_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.13.mlp.up_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 3.89it/s][A
|
|||
|
|
Processing model.layers.13.mlp.up_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.26it/s][A
|
|||
|
|
[A
Merging models: 58%|█████▊ | 252/435 [01:11<01:03, 2.86it/s]
|
|||
|
|
Processing model.layers.43.self_attn.v_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.43.self_attn.v_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 10.29it/s][A
|
|||
|
|
[A
Merging models: 58%|█████▊ | 253/435 [01:12<00:55, 3.30it/s]
|
|||
|
|
Processing model.layers.2.input_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.2.input_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 10.90it/s][A
|
|||
|
|
[A
Merging models: 58%|█████▊ | 254/435 [01:12<00:48, 3.74it/s]
|
|||
|
|
Processing model.layers.0.input_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.0.input_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 11.02it/s][A
|
|||
|
|
[A
Merging models: 59%|█████▊ | 255/435 [01:12<00:43, 4.14it/s]
|
|||
|
|
Processing lm_head.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing lm_head.weight: 50%|█████ | 1/2 [00:00<00:00, 3.16it/s][A
|
|||
|
|
Processing lm_head.weight: 100%|██████████| 2/2 [00:00<00:00, 2.94it/s][A
|
|||
|
|
[A
Merging models: 59%|█████▉ | 256/435 [01:13<01:06, 2.70it/s]
|
|||
|
|
Processing model.norm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.norm.weight: 100%|██████████| 2/2 [00:00<00:00, 11.19it/s][A
|
|||
|
|
[A
Merging models: 59%|█████▉ | 257/435 [01:13<00:55, 3.19it/s]
|
|||
|
|
Processing model.layers.45.mlp.up_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.45.mlp.up_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 5.12it/s][A
|
|||
|
|
Processing model.layers.45.mlp.up_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.84it/s][A
|
|||
|
|
[A
Merging models: 59%|█████▉ | 258/435 [01:13<01:00, 2.92it/s]
|
|||
|
|
Processing model.layers.9.mlp.gate_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.9.mlp.gate_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 3.91it/s][A
|
|||
|
|
Processing model.layers.9.mlp.gate_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.31it/s][A
|
|||
|
|
[A
Merging models: 60%|█████▉ | 259/435 [01:14<01:07, 2.62it/s]
|
|||
|
|
Processing model.layers.20.self_attn.q_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.20.self_attn.q_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 8.25it/s][A
|
|||
|
|
Processing model.layers.20.self_attn.q_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.74it/s][A
|
|||
|
|
[A
Merging models: 60%|█████▉ | 260/435 [01:14<01:00, 2.91it/s]
|
|||
|
|
Processing model.layers.44.self_attn.q_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.44.self_attn.q_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 8.15it/s][A
|
|||
|
|
Processing model.layers.44.self_attn.q_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.61it/s][A
|
|||
|
|
[A
Merging models: 60%|██████ | 261/435 [01:14<00:55, 3.14it/s]
|
|||
|
|
Processing model.layers.15.post_attention_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.15.post_attention_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 11.06it/s][A
|
|||
|
|
[A
Merging models: 60%|██████ | 262/435 [01:14<00:48, 3.60it/s]
|
|||
|
|
Processing model.layers.31.self_attn.k_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.31.self_attn.k_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 10.55it/s][A
|
|||
|
|
[A
Merging models: 60%|██████ | 263/435 [01:15<00:43, 3.98it/s]
|
|||
|
|
Processing model.layers.14.post_attention_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.14.post_attention_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 11.06it/s][A
|
|||
|
|
[A
Merging models: 61%|██████ | 264/435 [01:15<00:39, 4.34it/s]
|
|||
|
|
Processing model.layers.6.mlp.up_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.6.mlp.up_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 3.99it/s][A
|
|||
|
|
Processing model.layers.6.mlp.up_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.33it/s][A
|
|||
|
|
[A
Merging models: 61%|██████ | 265/435 [01:15<00:51, 3.31it/s]
|
|||
|
|
Processing model.layers.47.mlp.up_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.47.mlp.up_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 4.99it/s][A
|
|||
|
|
Processing model.layers.47.mlp.up_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.79it/s][A
|
|||
|
|
[A
Merging models: 61%|██████ | 266/435 [01:16<00:56, 2.98it/s]
|
|||
|
|
Processing model.layers.47.mlp.gate_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.47.mlp.gate_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 5.00it/s][A
|
|||
|
|
Processing model.layers.47.mlp.gate_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.77it/s][A
|
|||
|
|
[A
Merging models: 61%|██████▏ | 267/435 [01:16<01:00, 2.78it/s]
|
|||
|
|
Processing model.layers.36.self_attn.k_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.36.self_attn.k_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 10.64it/s][A
|
|||
|
|
[A
Merging models: 62%|██████▏ | 268/435 [01:16<00:51, 3.24it/s]
|
|||
|
|
Processing model.layers.46.post_attention_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.46.post_attention_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 11.10it/s][A
|
|||
|
|
[A
Merging models: 62%|██████▏ | 269/435 [01:16<00:44, 3.70it/s]
|
|||
|
|
Processing model.layers.9.input_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.9.input_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 10.96it/s][A
|
|||
|
|
[A
Merging models: 62%|██████▏ | 270/435 [01:17<00:40, 4.10it/s]
|
|||
|
|
Processing model.layers.40.self_attn.o_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.40.self_attn.o_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 7.83it/s][A
|
|||
|
|
Processing model.layers.40.self_attn.o_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.59it/s][A
|
|||
|
|
[A
Merging models: 62%|██████▏ | 271/435 [01:17<00:40, 4.01it/s]
|
|||
|
|
Processing model.layers.27.self_attn.v_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.27.self_attn.v_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 10.47it/s][A
|
|||
|
|
[A
Merging models: 63%|██████▎ | 272/435 [01:17<00:37, 4.31it/s]
|
|||
|
|
Processing model.layers.42.mlp.down_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.42.mlp.down_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 5.09it/s][A
|
|||
|
|
Processing model.layers.42.mlp.down_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.85it/s][A
|
|||
|
|
[A
Merging models: 63%|██████▎ | 273/435 [01:18<00:46, 3.50it/s]
|
|||
|
|
Processing model.layers.38.input_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.38.input_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 11.14it/s][A
|
|||
|
|
[A
Merging models: 63%|██████▎ | 274/435 [01:18<00:40, 3.94it/s]
|
|||
|
|
Processing model.layers.10.self_attn.k_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.10.self_attn.k_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 9.83it/s][A
|
|||
|
|
[A
Merging models: 63%|██████▎ | 275/435 [01:18<00:37, 4.22it/s]
|
|||
|
|
Processing model.layers.34.mlp.gate_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.34.mlp.gate_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 5.08it/s][A
|
|||
|
|
Processing model.layers.34.mlp.gate_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.83it/s][A
|
|||
|
|
[A
Merging models: 63%|██████▎ | 276/435 [01:18<00:45, 3.46it/s]
|
|||
|
|
Processing model.layers.11.self_attn.q_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.11.self_attn.q_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 7.29it/s][A
|
|||
|
|
Processing model.layers.11.self_attn.q_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.34it/s][A
|
|||
|
|
[A
Merging models: 64%|██████▎ | 277/435 [01:19<00:44, 3.52it/s]
|
|||
|
|
Processing model.layers.11.mlp.gate_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.11.mlp.gate_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 3.91it/s][A
|
|||
|
|
Processing model.layers.11.mlp.gate_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.31it/s][A
|
|||
|
|
[A
Merging models: 64%|██████▍ | 278/435 [01:19<00:53, 2.94it/s]
|
|||
|
|
Processing model.layers.15.self_attn.o_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.15.self_attn.o_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 8.35it/s][A
|
|||
|
|
Processing model.layers.15.self_attn.o_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.77it/s][A
|
|||
|
|
[A
Merging models: 64%|██████▍ | 279/435 [01:19<00:49, 3.17it/s]
|
|||
|
|
Processing model.layers.45.input_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.45.input_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 11.15it/s][A
|
|||
|
|
[A
Merging models: 64%|██████▍ | 280/435 [01:19<00:42, 3.64it/s]
|
|||
|
|
Processing model.layers.15.self_attn.v_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.15.self_attn.v_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 10.44it/s][A
|
|||
|
|
[A
Merging models: 65%|██████▍ | 281/435 [01:20<00:38, 4.00it/s]
|
|||
|
|
Processing model.layers.31.input_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.31.input_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 10.91it/s][A
|
|||
|
|
[A
Merging models: 65%|██████▍ | 282/435 [01:20<00:35, 4.35it/s]
|
|||
|
|
Processing model.layers.7.self_attn.q_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.7.self_attn.q_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 7.22it/s][A
|
|||
|
|
Processing model.layers.7.self_attn.q_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.37it/s][A
|
|||
|
|
[A
Merging models: 65%|██████▌ | 283/435 [01:20<00:36, 4.12it/s]
|
|||
|
|
Processing model.layers.5.self_attn.o_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.5.self_attn.o_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 7.08it/s][A
|
|||
|
|
Processing model.layers.5.self_attn.o_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.22it/s][A
|
|||
|
|
[A
Merging models: 65%|██████▌ | 284/435 [01:20<00:38, 3.95it/s]
|
|||
|
|
Processing model.layers.28.post_attention_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.28.post_attention_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 11.13it/s][A
|
|||
|
|
[A
Merging models: 66%|██████▌ | 285/435 [01:21<00:34, 4.32it/s]
|
|||
|
|
Processing model.layers.20.mlp.down_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.20.mlp.down_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 5.02it/s][A
|
|||
|
|
Processing model.layers.20.mlp.down_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.81it/s][A
|
|||
|
|
[A
Merging models: 66%|██████▌ | 286/435 [01:21<00:42, 3.49it/s]
|
|||
|
|
Processing model.layers.16.mlp.down_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.16.mlp.down_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 5.03it/s][A
|
|||
|
|
Processing model.layers.16.mlp.down_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.81it/s][A
|
|||
|
|
[A
Merging models: 66%|██████▌ | 287/435 [01:21<00:48, 3.08it/s]
|
|||
|
|
Processing model.layers.21.self_attn.o_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.21.self_attn.o_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 8.40it/s][A
|
|||
|
|
Processing model.layers.21.self_attn.o_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.79it/s][A
|
|||
|
|
[A
Merging models: 66%|██████▌ | 288/435 [01:22<00:44, 3.30it/s]
|
|||
|
|
Processing model.layers.2.mlp.gate_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.2.mlp.gate_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 3.92it/s][A
|
|||
|
|
Processing model.layers.2.mlp.gate_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.32it/s][A
|
|||
|
|
[A
Merging models: 66%|██████▋ | 289/435 [01:22<00:51, 2.83it/s]
|
|||
|
|
Processing model.layers.3.self_attn.k_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.3.self_attn.k_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 9.89it/s][A
|
|||
|
|
[A
Merging models: 67%|██████▋ | 290/435 [01:22<00:44, 3.26it/s]
|
|||
|
|
Processing model.layers.14.self_attn.k_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.14.self_attn.k_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 9.69it/s][A
|
|||
|
|
[A
Merging models: 67%|██████▋ | 291/435 [01:23<00:39, 3.63it/s]
|
|||
|
|
Processing model.layers.0.self_attn.v_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.0.self_attn.v_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 9.77it/s][A
|
|||
|
|
[A
Merging models: 67%|██████▋ | 292/435 [01:23<00:36, 3.96it/s]
|
|||
|
|
Processing model.layers.24.post_attention_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.24.post_attention_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 11.08it/s][A
|
|||
|
|
[A
Merging models: 67%|██████▋ | 293/435 [01:23<00:32, 4.33it/s]
|
|||
|
|
Processing model.layers.37.self_attn.o_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.37.self_attn.o_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 7.96it/s][A
|
|||
|
|
Processing model.layers.37.self_attn.o_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.70it/s][A
|
|||
|
|
[A
Merging models: 68%|██████▊ | 294/435 [01:23<00:33, 4.18it/s]
|
|||
|
|
Processing model.layers.23.mlp.gate_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.23.mlp.gate_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 4.97it/s][A
|
|||
|
|
Processing model.layers.23.mlp.gate_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.74it/s][A
|
|||
|
|
[A
Merging models: 68%|██████▊ | 295/435 [01:24<00:41, 3.41it/s]
|
|||
|
|
Processing model.layers.40.mlp.up_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.40.mlp.up_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 5.08it/s][A
|
|||
|
|
Processing model.layers.40.mlp.up_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.83it/s][A
|
|||
|
|
[A
Merging models: 68%|██████▊ | 296/435 [01:24<00:45, 3.04it/s]
|
|||
|
|
Processing model.layers.36.mlp.up_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.36.mlp.up_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 5.02it/s][A
|
|||
|
|
Processing model.layers.36.mlp.up_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.73it/s][A
|
|||
|
|
[A
Merging models: 68%|██████▊ | 297/435 [01:24<00:49, 2.81it/s]
|
|||
|
|
Processing model.layers.39.self_attn.o_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.39.self_attn.o_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 8.03it/s][A
|
|||
|
|
Processing model.layers.39.self_attn.o_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.67it/s][A
|
|||
|
|
[A
Merging models: 69%|██████▊ | 298/435 [01:25<00:44, 3.06it/s]
|
|||
|
|
Processing model.layers.14.self_attn.v_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.14.self_attn.v_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 9.36it/s][A
|
|||
|
|
Processing model.layers.14.self_attn.v_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 9.70it/s][A
|
|||
|
|
[A
Merging models: 69%|██████▊ | 299/435 [01:25<00:39, 3.43it/s]
|
|||
|
|
Processing model.layers.6.self_attn.q_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.6.self_attn.q_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 6.44it/s][A
|
|||
|
|
Processing model.layers.6.self_attn.q_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 6.69it/s][A
|
|||
|
|
[A
Merging models: 69%|██████▉ | 300/435 [01:25<00:39, 3.40it/s]
|
|||
|
|
Processing model.layers.43.mlp.down_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.43.mlp.down_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 4.81it/s][A
|
|||
|
|
Processing model.layers.43.mlp.down_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.64it/s][A
|
|||
|
|
[A
Merging models: 69%|██████▉ | 301/435 [01:26<00:44, 2.99it/s]
|
|||
|
|
Processing model.layers.39.mlp.up_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.39.mlp.up_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 5.13it/s][A
|
|||
|
|
Processing model.layers.39.mlp.up_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.86it/s][A
|
|||
|
|
[A
Merging models: 69%|██████▉ | 302/435 [01:26<00:47, 2.80it/s]
|
|||
|
|
Processing model.layers.13.mlp.down_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.13.mlp.down_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 3.86it/s][A
|
|||
|
|
Processing model.layers.13.mlp.down_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.29it/s][A
|
|||
|
|
[A
Merging models: 70%|██████▉ | 303/435 [01:27<00:51, 2.55it/s]
|
|||
|
|
Processing model.layers.27.self_attn.o_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.27.self_attn.o_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 7.78it/s][A
|
|||
|
|
Processing model.layers.27.self_attn.o_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.51it/s][A
|
|||
|
|
[A
Merging models: 70%|██████▉ | 304/435 [01:27<00:46, 2.82it/s]
|
|||
|
|
Processing model.layers.29.self_attn.o_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.29.self_attn.o_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 8.03it/s][A
|
|||
|
|
Processing model.layers.29.self_attn.o_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.58it/s][A
|
|||
|
|
[A
Merging models: 70%|███████ | 305/435 [01:27<00:42, 3.06it/s]
|
|||
|
|
Processing model.layers.16.self_attn.q_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.16.self_attn.q_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 7.71it/s][A
|
|||
|
|
Processing model.layers.16.self_attn.q_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.44it/s][A
|
|||
|
|
[A
Merging models: 70%|███████ | 306/435 [01:27<00:39, 3.24it/s]
|
|||
|
|
Processing model.layers.32.self_attn.v_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.32.self_attn.v_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 10.07it/s][A
|
|||
|
|
[A
Merging models: 71%|███████ | 307/435 [01:28<00:35, 3.63it/s]
|
|||
|
|
Processing model.layers.27.self_attn.k_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.27.self_attn.k_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 9.92it/s][A
|
|||
|
|
Processing model.layers.27.self_attn.k_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 9.86it/s][A
|
|||
|
|
[A
Merging models: 71%|███████ | 308/435 [01:28<00:32, 3.94it/s]
|
|||
|
|
Processing model.layers.32.self_attn.o_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.32.self_attn.o_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 7.57it/s][A
|
|||
|
|
Processing model.layers.32.self_attn.o_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.53it/s][A
|
|||
|
|
[A
Merging models: 71%|███████ | 309/435 [01:28<00:32, 3.88it/s]
|
|||
|
|
Processing model.layers.14.self_attn.q_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.14.self_attn.q_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 6.92it/s][A
|
|||
|
|
Processing model.layers.14.self_attn.q_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.18it/s][A
|
|||
|
|
[A
Merging models: 71%|███████▏ | 310/435 [01:28<00:33, 3.78it/s]
|
|||
|
|
Processing model.layers.30.input_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.30.input_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 10.91it/s][A
|
|||
|
|
[A
Merging models: 71%|███████▏ | 311/435 [01:28<00:29, 4.16it/s]
|
|||
|
|
Processing model.layers.3.input_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.3.input_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 10.90it/s][A
|
|||
|
|
[A
Merging models: 72%|███████▏ | 312/435 [01:29<00:27, 4.48it/s]
|
|||
|
|
Processing model.layers.30.self_attn.o_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.30.self_attn.o_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 7.75it/s][A
|
|||
|
|
Processing model.layers.30.self_attn.o_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.55it/s][A
|
|||
|
|
[A
Merging models: 72%|███████▏ | 313/435 [01:29<00:28, 4.24it/s]
|
|||
|
|
Processing model.layers.23.mlp.down_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.23.mlp.down_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 5.01it/s][A
|
|||
|
|
Processing model.layers.23.mlp.down_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.77it/s][A
|
|||
|
|
[A
Merging models: 72%|███████▏ | 314/435 [01:29<00:35, 3.45it/s]
|
|||
|
|
Processing model.layers.37.mlp.gate_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.37.mlp.gate_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 5.03it/s][A
|
|||
|
|
Processing model.layers.37.mlp.gate_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.84it/s][A
|
|||
|
|
[A
Merging models: 72%|███████▏ | 315/435 [01:30<00:39, 3.06it/s]
|
|||
|
|
Processing model.layers.42.self_attn.o_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.42.self_attn.o_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 7.73it/s][A
|
|||
|
|
Processing model.layers.42.self_attn.o_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.45it/s][A
|
|||
|
|
[A
Merging models: 73%|███████▎ | 316/435 [01:30<00:36, 3.24it/s]
|
|||
|
|
Processing model.layers.43.input_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.43.input_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 11.02it/s][A
|
|||
|
|
[A
Merging models: 73%|███████▎ | 317/435 [01:30<00:31, 3.70it/s]
|
|||
|
|
Processing model.layers.40.self_attn.q_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.40.self_attn.q_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 7.76it/s][A
|
|||
|
|
Processing model.layers.40.self_attn.q_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.20it/s][A
|
|||
|
|
[A
Merging models: 73%|███████▎ | 318/435 [01:30<00:31, 3.68it/s]
|
|||
|
|
Processing model.layers.17.self_attn.o_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.17.self_attn.o_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 7.54it/s][A
|
|||
|
|
Processing model.layers.17.self_attn.o_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.46it/s][A
|
|||
|
|
[A
Merging models: 73%|███████▎ | 319/435 [01:31<00:31, 3.69it/s]
|
|||
|
|
Processing model.layers.8.mlp.gate_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.8.mlp.gate_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 3.90it/s][A
|
|||
|
|
Processing model.layers.8.mlp.gate_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.29it/s][A
|
|||
|
|
[A
Merging models: 74%|███████▎ | 320/435 [01:31<00:38, 3.02it/s]
|
|||
|
|
Processing model.layers.17.self_attn.v_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.17.self_attn.v_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 10.64it/s][A
|
|||
|
|
[A
Merging models: 74%|███████▍ | 321/435 [01:31<00:32, 3.46it/s]
|
|||
|
|
Processing model.layers.1.post_attention_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.1.post_attention_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 10.87it/s][A
|
|||
|
|
[A
Merging models: 74%|███████▍ | 322/435 [01:32<00:29, 3.89it/s]
|
|||
|
|
Processing model.layers.11.self_attn.k_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.11.self_attn.k_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 9.40it/s][A
|
|||
|
|
[A
Merging models: 74%|███████▍ | 323/435 [01:32<00:27, 4.14it/s]
|
|||
|
|
Processing model.layers.28.self_attn.k_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.28.self_attn.k_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 10.58it/s][A
|
|||
|
|
[A
Merging models: 74%|███████▍ | 324/435 [01:32<00:25, 4.42it/s]
|
|||
|
|
Processing model.layers.4.input_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.4.input_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 10.85it/s][A
|
|||
|
|
[A
Merging models: 75%|███████▍ | 325/435 [01:32<00:23, 4.68it/s]
|
|||
|
|
Processing model.layers.21.mlp.down_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.21.mlp.down_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 5.10it/s][A
|
|||
|
|
Processing model.layers.21.mlp.down_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.81it/s][A
|
|||
|
|
[A
Merging models: 75%|███████▍ | 326/435 [01:33<00:29, 3.66it/s]
|
|||
|
|
Processing model.layers.39.self_attn.q_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.39.self_attn.q_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 8.22it/s][A
|
|||
|
|
Processing model.layers.39.self_attn.q_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.65it/s][A
|
|||
|
|
[A
Merging models: 75%|███████▌ | 327/435 [01:33<00:29, 3.72it/s]
|
|||
|
|
Processing model.layers.30.self_attn.k_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.30.self_attn.k_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 10.51it/s][A
|
|||
|
|
[A
Merging models: 75%|███████▌ | 328/435 [01:33<00:26, 4.07it/s]
|
|||
|
|
Processing model.layers.3.post_attention_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.3.post_attention_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 10.87it/s][A
|
|||
|
|
[A
Merging models: 76%|███████▌ | 329/435 [01:33<00:24, 4.40it/s]
|
|||
|
|
Processing model.layers.19.input_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.19.input_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 10.87it/s][A
|
|||
|
|
[A
Merging models: 76%|███████▌ | 330/435 [01:33<00:22, 4.67it/s]
|
|||
|
|
Processing model.layers.33.self_attn.q_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.33.self_attn.q_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 8.34it/s][A
|
|||
|
|
Processing model.layers.33.self_attn.q_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.67it/s][A
|
|||
|
|
[A
Merging models: 76%|███████▌ | 331/435 [01:34<00:23, 4.40it/s]
|
|||
|
|
Processing model.layers.9.self_attn.o_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.9.self_attn.o_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 7.03it/s][A
|
|||
|
|
Processing model.layers.9.self_attn.o_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.26it/s][A
|
|||
|
|
[A
Merging models: 76%|███████▋ | 332/435 [01:34<00:24, 4.13it/s]
|
|||
|
|
Processing model.layers.9.mlp.down_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.9.mlp.down_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 3.94it/s][A
|
|||
|
|
Processing model.layers.9.mlp.down_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.33it/s][A
|
|||
|
|
[A
Merging models: 77%|███████▋ | 333/435 [01:34<00:31, 3.22it/s]
|
|||
|
|
Processing model.layers.36.input_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.36.input_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 10.98it/s][A
|
|||
|
|
[A
Merging models: 77%|███████▋ | 334/435 [01:35<00:27, 3.68it/s]
|
|||
|
|
Processing model.layers.30.mlp.gate_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.30.mlp.gate_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 5.05it/s][A
|
|||
|
|
Processing model.layers.30.mlp.gate_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.81it/s][A
|
|||
|
|
[A
Merging models: 77%|███████▋ | 335/435 [01:35<00:31, 3.18it/s]
|
|||
|
|
Processing model.layers.6.self_attn.o_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.6.self_attn.o_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 7.28it/s][A
|
|||
|
|
Processing model.layers.6.self_attn.o_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.36it/s][A
|
|||
|
|
[A
Merging models: 77%|███████▋ | 336/435 [01:35<00:29, 3.31it/s]
|
|||
|
|
Processing model.layers.39.mlp.down_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.39.mlp.down_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 4.93it/s][A
|
|||
|
|
Processing model.layers.39.mlp.down_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.73it/s][A
|
|||
|
|
[A
Merging models: 77%|███████▋ | 337/435 [01:36<00:33, 2.96it/s]
|
|||
|
|
Processing model.layers.37.mlp.up_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.37.mlp.up_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 5.05it/s][A
|
|||
|
|
Processing model.layers.37.mlp.up_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.79it/s][A
|
|||
|
|
[A
Merging models: 78%|███████▊ | 338/435 [01:36<00:34, 2.77it/s]
|
|||
|
|
Processing model.layers.27.mlp.down_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.27.mlp.down_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 4.88it/s][A
|
|||
|
|
Processing model.layers.27.mlp.down_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.69it/s][A
|
|||
|
|
[A
Merging models: 78%|███████▊ | 339/435 [01:36<00:36, 2.63it/s]
|
|||
|
|
Processing model.layers.17.mlp.gate_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.17.mlp.gate_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 5.02it/s][A
|
|||
|
|
Processing model.layers.17.mlp.gate_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.76it/s][A
|
|||
|
|
[A
Merging models: 78%|███████▊ | 340/435 [01:37<00:37, 2.56it/s]
|
|||
|
|
Processing model.layers.35.mlp.up_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.35.mlp.up_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 5.03it/s][A
|
|||
|
|
Processing model.layers.35.mlp.up_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.72it/s][A
|
|||
|
|
[A
Merging models: 78%|███████▊ | 341/435 [01:37<00:37, 2.50it/s]
|
|||
|
|
Processing model.layers.35.input_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.35.input_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 11.13it/s][A
|
|||
|
|
[A
Merging models: 79%|███████▊ | 342/435 [01:38<00:31, 2.99it/s]
|
|||
|
|
Processing model.layers.26.mlp.up_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.26.mlp.up_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 5.04it/s][A
|
|||
|
|
Processing model.layers.26.mlp.up_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.78it/s][A
|
|||
|
|
[A
Merging models: 79%|███████▉ | 343/435 [01:38<00:32, 2.79it/s]
|
|||
|
|
Processing model.layers.16.post_attention_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.16.post_attention_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 10.98it/s][A
|
|||
|
|
[A
Merging models: 79%|███████▉ | 344/435 [01:38<00:27, 3.27it/s]
|
|||
|
|
Processing model.layers.34.self_attn.v_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.34.self_attn.v_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 10.71it/s][A
|
|||
|
|
[A
Merging models: 79%|███████▉ | 345/435 [01:38<00:24, 3.70it/s]
|
|||
|
|
Processing model.layers.10.mlp.gate_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.10.mlp.gate_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 2.90it/s][A
|
|||
|
|
Processing model.layers.10.mlp.gate_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 3.73it/s][A
|
|||
|
|
[A
Merging models: 80%|███████▉ | 346/435 [01:39<00:31, 2.80it/s]
|
|||
|
|
Processing model.layers.23.self_attn.o_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.23.self_attn.o_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 8.10it/s][A
|
|||
|
|
Processing model.layers.23.self_attn.o_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.56it/s][A
|
|||
|
|
[A
Merging models: 80%|███████▉ | 347/435 [01:39<00:28, 3.04it/s]
|
|||
|
|
Processing model.layers.3.mlp.gate_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.3.mlp.gate_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 3.75it/s][A
|
|||
|
|
Processing model.layers.3.mlp.gate_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 3.08it/s][A
|
|||
|
|
[A
Merging models: 80%|████████ | 348/435 [01:40<00:36, 2.38it/s]
|
|||
|
|
Processing model.layers.24.mlp.up_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.24.mlp.up_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 4.82it/s][A
|
|||
|
|
Processing model.layers.24.mlp.up_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.56it/s][A
|
|||
|
|
[A
Merging models: 80%|████████ | 349/435 [01:40<00:36, 2.36it/s]
|
|||
|
|
Processing model.layers.14.input_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.14.input_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 10.71it/s][A
|
|||
|
|
[A
Merging models: 80%|████████ | 350/435 [01:40<00:30, 2.83it/s]
|
|||
|
|
Processing model.layers.23.post_attention_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.23.post_attention_layernorm.weight: 50%|█████ | 1/2 [00:00<00:00, 9.72it/s][A
|
|||
|
|
[A
Merging models: 81%|████████ | 351/435 [01:41<00:25, 3.26it/s]
|
|||
|
|
Processing model.layers.42.mlp.gate_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.42.mlp.gate_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 3.95it/s][A
|
|||
|
|
Processing model.layers.42.mlp.gate_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.30it/s][A
|
|||
|
|
[A
Merging models: 81%|████████ | 352/435 [01:41<00:29, 2.81it/s]
|
|||
|
|
Processing model.layers.10.post_attention_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.10.post_attention_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 10.99it/s][A
|
|||
|
|
[A
Merging models: 81%|████████ | 353/435 [01:41<00:24, 3.29it/s]
|
|||
|
|
Processing model.layers.11.self_attn.o_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.11.self_attn.o_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 7.09it/s][A
|
|||
|
|
Processing model.layers.11.self_attn.o_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.18it/s][A
|
|||
|
|
[A
Merging models: 81%|████████▏ | 354/435 [01:42<00:24, 3.37it/s]
|
|||
|
|
Processing model.layers.7.input_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.7.input_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 11.04it/s][A
|
|||
|
|
[A
Merging models: 82%|████████▏ | 355/435 [01:42<00:20, 3.81it/s]
|
|||
|
|
Processing model.layers.39.mlp.gate_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.39.mlp.gate_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 4.84it/s][A
|
|||
|
|
Processing model.layers.39.mlp.gate_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 3.48it/s][A
|
|||
|
|
[A
Merging models: 82%|████████▏ | 356/435 [01:42<00:27, 2.87it/s]
|
|||
|
|
Processing model.layers.36.post_attention_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.36.post_attention_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 11.19it/s][A
|
|||
|
|
[A
Merging models: 82%|████████▏ | 357/435 [01:42<00:23, 3.36it/s]
|
|||
|
|
Processing model.layers.14.mlp.up_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.14.mlp.up_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 3.95it/s][A
|
|||
|
|
Processing model.layers.14.mlp.up_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.29it/s][A
|
|||
|
|
[A
Merging models: 82%|████████▏ | 358/435 [01:43<00:26, 2.86it/s]
|
|||
|
|
Processing model.layers.5.self_attn.v_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.5.self_attn.v_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 9.80it/s][A
|
|||
|
|
[A
Merging models: 83%|████████▎ | 359/435 [01:43<00:23, 3.28it/s]
|
|||
|
|
Processing model.layers.14.mlp.gate_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.14.mlp.gate_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 2.86it/s][A
|
|||
|
|
Processing model.layers.14.mlp.gate_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 3.67it/s][A
|
|||
|
|
[A
Merging models: 83%|████████▎ | 360/435 [01:44<00:28, 2.60it/s]
|
|||
|
|
Processing model.layers.0.post_attention_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.0.post_attention_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 11.02it/s][A
|
|||
|
|
[A
Merging models: 83%|████████▎ | 361/435 [01:44<00:23, 3.09it/s]
|
|||
|
|
Processing model.layers.17.mlp.up_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.17.mlp.up_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 4.47it/s][A
|
|||
|
|
Processing model.layers.17.mlp.up_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.60it/s][A
|
|||
|
|
[A
Merging models: 83%|████████▎ | 362/435 [01:44<00:26, 2.80it/s]
|
|||
|
|
Processing model.layers.25.mlp.up_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.25.mlp.up_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 4.88it/s][A
|
|||
|
|
Processing model.layers.25.mlp.up_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 3.66it/s][A
|
|||
|
|
[A
Merging models: 83%|████████▎ | 363/435 [01:45<00:29, 2.45it/s]
|
|||
|
|
Processing model.layers.5.mlp.up_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.5.mlp.up_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 3.91it/s][A
|
|||
|
|
Processing model.layers.5.mlp.up_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.26it/s][A
|
|||
|
|
[A
Merging models: 84%|████████▎ | 364/435 [01:45<00:30, 2.33it/s]
|
|||
|
|
Processing model.layers.34.mlp.up_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.34.mlp.up_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 4.97it/s][A
|
|||
|
|
Processing model.layers.34.mlp.up_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 3.56it/s][A
|
|||
|
|
[A
Merging models: 84%|████████▍ | 365/435 [01:46<00:32, 2.17it/s]
|
|||
|
|
Processing model.layers.39.self_attn.v_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.39.self_attn.v_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 10.45it/s][A
|
|||
|
|
[A
Merging models: 84%|████████▍ | 366/435 [01:46<00:26, 2.63it/s]
|
|||
|
|
Processing model.layers.47.self_attn.o_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.47.self_attn.o_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 7.58it/s][A
|
|||
|
|
Processing model.layers.47.self_attn.o_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.35it/s][A
|
|||
|
|
[A
Merging models: 84%|████████▍ | 367/435 [01:46<00:23, 2.88it/s]
|
|||
|
|
Processing model.layers.18.mlp.gate_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.18.mlp.gate_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 3.77it/s][A
|
|||
|
|
Processing model.layers.18.mlp.gate_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.24it/s][A
|
|||
|
|
[A
Merging models: 85%|████████▍ | 368/435 [01:47<00:25, 2.58it/s]
|
|||
|
|
Processing model.layers.44.self_attn.o_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.44.self_attn.o_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 8.15it/s][A
|
|||
|
|
Processing model.layers.44.self_attn.o_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.52it/s][A
|
|||
|
|
[A
Merging models: 85%|████████▍ | 369/435 [01:47<00:23, 2.85it/s]
|
|||
|
|
Processing model.layers.22.self_attn.q_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.22.self_attn.q_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 8.14it/s][A
|
|||
|
|
Processing model.layers.22.self_attn.q_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.54it/s][A
|
|||
|
|
[A
Merging models: 85%|████████▌ | 370/435 [01:47<00:21, 3.08it/s]
|
|||
|
|
Processing model.layers.10.input_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.10.input_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 10.96it/s][A
|
|||
|
|
[A
Merging models: 85%|████████▌ | 371/435 [01:47<00:18, 3.55it/s]
|
|||
|
|
Processing model.layers.44.input_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.44.input_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 11.17it/s][A
|
|||
|
|
[A
Merging models: 86%|████████▌ | 372/435 [01:48<00:15, 3.98it/s]
|
|||
|
|
Processing model.layers.19.mlp.up_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.19.mlp.up_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 3.38it/s][A
|
|||
|
|
Processing model.layers.19.mlp.up_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 3.98it/s][A
|
|||
|
|
[A
Merging models: 86%|████████▌ | 373/435 [01:48<00:20, 3.02it/s]
|
|||
|
|
Processing model.layers.21.mlp.gate_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.21.mlp.gate_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 4.88it/s][A
|
|||
|
|
Processing model.layers.21.mlp.gate_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.62it/s][A
|
|||
|
|
[A
Merging models: 86%|████████▌ | 374/435 [01:49<00:21, 2.77it/s]
|
|||
|
|
Processing model.layers.19.self_attn.q_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.19.self_attn.q_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 6.87it/s][A
|
|||
|
|
Processing model.layers.19.self_attn.q_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.02it/s][A
|
|||
|
|
[A
Merging models: 86%|████████▌ | 375/435 [01:49<00:20, 2.96it/s]
|
|||
|
|
Processing model.layers.1.self_attn.k_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.1.self_attn.k_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 9.68it/s][A
|
|||
|
|
[A
Merging models: 86%|████████▋ | 376/435 [01:49<00:17, 3.36it/s]
|
|||
|
|
Processing model.layers.9.self_attn.k_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.9.self_attn.k_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 9.46it/s][A
|
|||
|
|
[A
Merging models: 87%|████████▋ | 377/435 [01:49<00:15, 3.71it/s]
|
|||
|
|
Processing model.layers.24.self_attn.k_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.24.self_attn.k_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 9.98it/s][A
|
|||
|
|
[A
Merging models: 87%|████████▋ | 378/435 [01:49<00:14, 4.02it/s]
|
|||
|
|
Processing model.layers.8.mlp.down_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.8.mlp.down_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 2.85it/s][A
|
|||
|
|
Processing model.layers.8.mlp.down_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 3.73it/s][A
|
|||
|
|
[A
Merging models: 87%|████████▋ | 379/435 [01:50<00:19, 2.92it/s]
|
|||
|
|
Processing model.layers.8.post_attention_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.8.post_attention_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 10.98it/s][A
|
|||
|
|
[A
Merging models: 87%|████████▋ | 380/435 [01:50<00:16, 3.39it/s]
|
|||
|
|
Processing model.layers.28.self_attn.o_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.28.self_attn.o_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 7.72it/s][A
|
|||
|
|
Processing model.layers.28.self_attn.o_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.40it/s][A
|
|||
|
|
[A
Merging models: 88%|████████▊ | 381/435 [01:51<00:15, 3.48it/s]
|
|||
|
|
Processing model.layers.1.self_attn.v_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.1.self_attn.v_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 9.87it/s][A
|
|||
|
|
[A
Merging models: 88%|████████▊ | 382/435 [01:51<00:13, 3.83it/s]
|
|||
|
|
Processing model.layers.37.self_attn.v_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.37.self_attn.v_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 10.23it/s][A
|
|||
|
|
[A
Merging models: 88%|████████▊ | 383/435 [01:51<00:12, 4.14it/s]
|
|||
|
|
Processing model.layers.13.self_attn.o_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.13.self_attn.o_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 7.31it/s][A
|
|||
|
|
Processing model.layers.13.self_attn.o_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.33it/s][A
|
|||
|
|
[A
Merging models: 88%|████████▊ | 384/435 [01:51<00:12, 3.98it/s]
|
|||
|
|
Processing model.layers.9.self_attn.q_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.9.self_attn.q_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 7.04it/s][A
|
|||
|
|
Processing model.layers.9.self_attn.q_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.19it/s][A
|
|||
|
|
[A
Merging models: 89%|████████▊ | 385/435 [01:51<00:12, 3.85it/s]
|
|||
|
|
Processing model.layers.40.self_attn.v_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.40.self_attn.v_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 10.66it/s][A
|
|||
|
|
[A
Merging models: 89%|████████▊ | 386/435 [01:52<00:11, 4.20it/s]
|
|||
|
|
Processing model.layers.5.mlp.down_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.5.mlp.down_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 3.70it/s][A
|
|||
|
|
Processing model.layers.5.mlp.down_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 3.94it/s][A
|
|||
|
|
[A
Merging models: 89%|████████▉ | 387/435 [01:52<00:15, 3.12it/s]
|
|||
|
|
Processing model.layers.43.mlp.gate_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.43.mlp.gate_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 4.66it/s][A
|
|||
|
|
Processing model.layers.43.mlp.gate_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.38it/s][A
|
|||
|
|
[A
Merging models: 89%|████████▉ | 388/435 [01:53<00:16, 2.78it/s]
|
|||
|
|
Processing model.layers.30.self_attn.q_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.30.self_attn.q_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 7.59it/s][A
|
|||
|
|
Processing model.layers.30.self_attn.q_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.23it/s][A
|
|||
|
|
[A
Merging models: 89%|████████▉ | 389/435 [01:53<00:15, 2.99it/s]
|
|||
|
|
Processing model.layers.5.input_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.5.input_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 10.98it/s][A
|
|||
|
|
[A
Merging models: 90%|████████▉ | 390/435 [01:53<00:13, 3.46it/s]
|
|||
|
|
Processing model.layers.18.self_attn.q_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.18.self_attn.q_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 6.60it/s][A
|
|||
|
|
Processing model.layers.18.self_attn.q_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 6.82it/s][A
|
|||
|
|
[A
Merging models: 90%|████████▉ | 391/435 [01:53<00:12, 3.44it/s]
|
|||
|
|
Processing model.layers.45.mlp.down_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.45.mlp.down_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 4.63it/s][A
|
|||
|
|
Processing model.layers.45.mlp.down_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.43it/s][A
|
|||
|
|
[A
Merging models: 90%|█████████ | 392/435 [01:54<00:14, 2.96it/s]
|
|||
|
|
Processing model.layers.20.self_attn.o_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.20.self_attn.o_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 6.98it/s][A
|
|||
|
|
Processing model.layers.20.self_attn.o_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.04it/s][A
|
|||
|
|
[A
Merging models: 90%|█████████ | 393/435 [01:54<00:13, 3.10it/s]
|
|||
|
|
Processing model.layers.38.post_attention_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.38.post_attention_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 11.17it/s][A
|
|||
|
|
[A
Merging models: 91%|█████████ | 394/435 [01:54<00:11, 3.58it/s]
|
|||
|
|
Processing model.layers.21.mlp.up_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.21.mlp.up_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 4.73it/s][A
|
|||
|
|
Processing model.layers.21.mlp.up_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.50it/s][A
|
|||
|
|
[A
Merging models: 91%|█████████ | 395/435 [01:55<00:13, 3.05it/s]
|
|||
|
|
Processing model.layers.22.post_attention_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.22.post_attention_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 11.14it/s][A
|
|||
|
|
[A
Merging models: 91%|█████████ | 396/435 [01:55<00:11, 3.53it/s]
|
|||
|
|
Processing model.layers.4.mlp.down_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.4.mlp.down_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 3.70it/s][A
|
|||
|
|
Processing model.layers.4.mlp.down_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.09it/s][A
|
|||
|
|
[A
Merging models: 91%|█████████▏| 397/435 [01:55<00:13, 2.88it/s]
|
|||
|
|
Processing model.layers.35.mlp.down_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.35.mlp.down_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 4.81it/s][A
|
|||
|
|
Processing model.layers.35.mlp.down_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.59it/s][A
|
|||
|
|
[A
Merging models: 91%|█████████▏| 398/435 [01:56<00:13, 2.68it/s]
|
|||
|
|
Processing model.layers.18.post_attention_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.18.post_attention_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 10.98it/s][A
|
|||
|
|
[A
Merging models: 92%|█████████▏| 399/435 [01:56<00:11, 3.16it/s]
|
|||
|
|
Processing model.layers.38.self_attn.q_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.38.self_attn.q_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 7.68it/s][A
|
|||
|
|
Processing model.layers.38.self_attn.q_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.35it/s][A
|
|||
|
|
[A
Merging models: 92%|█████████▏| 400/435 [01:56<00:10, 3.31it/s]
|
|||
|
|
Processing model.layers.22.input_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.22.input_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 11.13it/s][A
|
|||
|
|
[A
Merging models: 92%|█████████▏| 401/435 [01:56<00:09, 3.76it/s]
|
|||
|
|
Processing model.layers.16.self_attn.o_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.16.self_attn.o_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 6.81it/s][A
|
|||
|
|
Processing model.layers.16.self_attn.o_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.06it/s][A
|
|||
|
|
[A
Merging models: 92%|█████████▏| 402/435 [01:57<00:08, 3.68it/s]
|
|||
|
|
Processing model.layers.7.mlp.gate_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.7.mlp.gate_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 2.87it/s][A
|
|||
|
|
Processing model.layers.7.mlp.gate_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 3.67it/s][A
|
|||
|
|
[A
Merging models: 93%|█████████▎| 403/435 [01:57<00:11, 2.77it/s]
|
|||
|
|
Processing model.layers.13.self_attn.k_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.13.self_attn.k_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 9.82it/s][A
|
|||
|
|
[A
Merging models: 93%|█████████▎| 404/435 [01:58<00:09, 3.20it/s]
|
|||
|
|
Processing model.layers.43.post_attention_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.43.post_attention_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 11.16it/s][A
|
|||
|
|
[A
Merging models: 93%|█████████▎| 405/435 [01:58<00:08, 3.67it/s]
|
|||
|
|
Processing model.layers.44.self_attn.k_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.44.self_attn.k_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 10.35it/s][A
|
|||
|
|
[A
Merging models: 93%|█████████▎| 406/435 [01:58<00:07, 4.02it/s]
|
|||
|
|
Processing model.layers.29.self_attn.q_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.29.self_attn.q_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 7.88it/s][A
|
|||
|
|
Processing model.layers.29.self_attn.q_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.51it/s][A
|
|||
|
|
[A
Merging models: 94%|█████████▎| 407/435 [01:58<00:07, 3.94it/s]
|
|||
|
|
Processing model.layers.39.input_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.39.input_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 11.17it/s][A
|
|||
|
|
[A
Merging models: 94%|█████████▍| 408/435 [01:58<00:06, 4.32it/s]
|
|||
|
|
Processing model.layers.33.mlp.gate_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.33.mlp.gate_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 4.95it/s][A
|
|||
|
|
Processing model.layers.33.mlp.gate_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.72it/s][A
|
|||
|
|
[A
Merging models: 94%|█████████▍| 409/435 [01:59<00:07, 3.47it/s]
|
|||
|
|
Processing model.layers.8.self_attn.o_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.8.self_attn.o_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 7.28it/s][A
|
|||
|
|
Processing model.layers.8.self_attn.o_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.32it/s][A
|
|||
|
|
[A
Merging models: 94%|█████████▍| 410/435 [01:59<00:07, 3.52it/s]
|
|||
|
|
Processing model.layers.47.self_attn.v_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.47.self_attn.v_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 10.49it/s][A
|
|||
|
|
[A
Merging models: 94%|█████████▍| 411/435 [01:59<00:06, 3.90it/s]
|
|||
|
|
Processing model.layers.27.mlp.up_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.27.mlp.up_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 5.08it/s][A
|
|||
|
|
Processing model.layers.27.mlp.up_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.78it/s][A
|
|||
|
|
[A
Merging models: 95%|█████████▍| 412/435 [02:00<00:06, 3.29it/s]
|
|||
|
|
Processing model.layers.13.input_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.13.input_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 11.02it/s][A
|
|||
|
|
[A
Merging models: 95%|█████████▍| 413/435 [02:00<00:05, 3.74it/s]
|
|||
|
|
Processing model.layers.16.self_attn.v_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.16.self_attn.v_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 9.72it/s][A
|
|||
|
|
[A
Merging models: 95%|█████████▌| 414/435 [02:00<00:05, 4.05it/s]
|
|||
|
|
Processing model.layers.15.input_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.15.input_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 10.97it/s][A
|
|||
|
|
[A
Merging models: 95%|█████████▌| 415/435 [02:00<00:04, 4.39it/s]
|
|||
|
|
Processing model.layers.2.mlp.down_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.2.mlp.down_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 3.95it/s][A
|
|||
|
|
Processing model.layers.2.mlp.down_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 3.54it/s][A
|
|||
|
|
[A
Merging models: 96%|█████████▌| 416/435 [02:01<00:06, 3.06it/s]
|
|||
|
|
Processing model.layers.42.self_attn.k_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.42.self_attn.k_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 10.44it/s][A
|
|||
|
|
[A
Merging models: 96%|█████████▌| 417/435 [02:01<00:05, 3.50it/s]
|
|||
|
|
Processing model.layers.32.self_attn.k_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.32.self_attn.k_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 10.55it/s][A
|
|||
|
|
[A
Merging models: 96%|█████████▌| 418/435 [02:01<00:04, 3.89it/s]
|
|||
|
|
Processing model.layers.3.mlp.down_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.3.mlp.down_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 3.92it/s][A
|
|||
|
|
Processing model.layers.3.mlp.down_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.26it/s][A
|
|||
|
|
[A
Merging models: 96%|█████████▋| 419/435 [02:02<00:05, 3.10it/s]
|
|||
|
|
Processing model.layers.45.self_attn.q_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.45.self_attn.q_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 7.52it/s][A
|
|||
|
|
Processing model.layers.45.self_attn.q_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.32it/s][A
|
|||
|
|
[A
Merging models: 97%|█████████▋| 420/435 [02:02<00:04, 3.25it/s]
|
|||
|
|
Processing model.layers.4.self_attn.q_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.4.self_attn.q_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 6.79it/s][A
|
|||
|
|
Processing model.layers.4.self_attn.q_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.06it/s][A
|
|||
|
|
[A
Merging models: 97%|█████████▋| 421/435 [02:02<00:04, 3.32it/s]
|
|||
|
|
Processing model.layers.23.self_attn.q_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.23.self_attn.q_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 7.68it/s][A
|
|||
|
|
Processing model.layers.23.self_attn.q_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.40it/s][A
|
|||
|
|
[A
Merging models: 97%|█████████▋| 422/435 [02:02<00:03, 3.43it/s]
|
|||
|
|
Processing model.layers.31.post_attention_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.31.post_attention_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 11.16it/s][A
|
|||
|
|
[A
Merging models: 97%|█████████▋| 423/435 [02:03<00:03, 3.88it/s]
|
|||
|
|
Processing model.layers.12.mlp.down_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.12.mlp.down_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 2.82it/s][A
|
|||
|
|
Processing model.layers.12.mlp.down_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 3.56it/s][A
|
|||
|
|
[A
Merging models: 97%|█████████▋| 424/435 [02:03<00:03, 2.81it/s]
|
|||
|
|
Processing model.layers.33.mlp.up_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.33.mlp.up_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 4.68it/s][A
|
|||
|
|
Processing model.layers.33.mlp.up_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.43it/s][A
|
|||
|
|
[A
Merging models: 98%|█████████▊| 425/435 [02:04<00:03, 2.61it/s]
|
|||
|
|
Processing model.layers.7.mlp.down_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.7.mlp.down_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 3.71it/s][A
|
|||
|
|
Processing model.layers.7.mlp.down_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 3.38it/s][A
|
|||
|
|
[A
Merging models: 98%|█████████▊| 426/435 [02:04<00:03, 2.25it/s]
|
|||
|
|
Processing model.layers.39.post_attention_layernorm.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.39.post_attention_layernorm.weight: 100%|██████████| 2/2 [00:00<00:00, 11.18it/s][A
|
|||
|
|
[A
Merging models: 98%|█████████▊| 427/435 [02:04<00:02, 2.74it/s]
|
|||
|
|
Processing model.layers.9.mlp.up_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.9.mlp.up_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 3.82it/s][A
|
|||
|
|
Processing model.layers.9.mlp.up_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 4.20it/s][A
|
|||
|
|
[A
Merging models: 98%|█████████▊| 428/435 [02:05<00:02, 2.50it/s]
|
|||
|
|
Processing model.layers.15.mlp.gate_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.15.mlp.gate_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 3.00it/s][A
|
|||
|
|
Processing model.layers.15.mlp.gate_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 3.76it/s][A
|
|||
|
|
[A
Merging models: 99%|█████████▊| 429/435 [02:05<00:02, 2.24it/s]
|
|||
|
|
Processing model.layers.26.self_attn.q_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.26.self_attn.q_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 7.79it/s][A
|
|||
|
|
Processing model.layers.26.self_attn.q_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.42it/s][A
|
|||
|
|
[A
Merging models: 99%|█████████▉| 430/435 [02:06<00:01, 2.55it/s]
|
|||
|
|
Processing model.layers.29.self_attn.k_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.29.self_attn.k_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 10.68it/s][A
|
|||
|
|
[A
Merging models: 99%|█████████▉| 431/435 [02:06<00:01, 3.02it/s]
|
|||
|
|
Processing model.layers.5.self_attn.q_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.5.self_attn.q_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 7.02it/s][A
|
|||
|
|
Processing model.layers.5.self_attn.q_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.20it/s][A
|
|||
|
|
[A
Merging models: 99%|█████████▉| 432/435 [02:06<00:00, 3.17it/s]
|
|||
|
|
Processing model.layers.18.self_attn.o_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.18.self_attn.o_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 7.00it/s][A
|
|||
|
|
Processing model.layers.18.self_attn.o_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.21it/s][A
|
|||
|
|
[A
Merging models: 100%|█████████▉| 433/435 [02:06<00:00, 3.28it/s]
|
|||
|
|
Processing model.layers.2.self_attn.v_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.2.self_attn.v_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 9.85it/s][A
|
|||
|
|
[A
Merging models: 100%|█████████▉| 434/435 [02:07<00:00, 3.67it/s]
|
|||
|
|
Processing model.layers.3.self_attn.o_proj.weight: 0%| | 0/2 [00:00<?, ?it/s][A
|
|||
|
|
Processing model.layers.3.self_attn.o_proj.weight: 50%|█████ | 1/2 [00:00<00:00, 7.34it/s][A
|
|||
|
|
Processing model.layers.3.self_attn.o_proj.weight: 100%|██████████| 2/2 [00:00<00:00, 7.37it/s][A
|
|||
|
|
[A
Merging models: 100%|██████████| 435/435 [02:07<00:00, 3.67it/s]
Merging models: 100%|██████████| 435/435 [02:07<00:00, 3.41it/s]
|
|||
|
|
create a temporary file to store mixed weights: /tmp/tmp_s47mw59.ckpt
|
|||
|
|
***weight for each model***:
|
|||
|
|
/media/hangyu5/Home/Documents/Hugging-Face/LM_cocktail/meow 0.5
|
|||
|
|
/media/hangyu5/Home/Documents/Hugging-Face/LM_cocktail/SOLAR-10.7B-Instruct-v1.0 0.5
|
|||
|
|
Loading checkpoint shards: 0%| | 0/5 [00:00<?, ?it/s]/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.embed_tokens.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.0.self_attn.q_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.0.self_attn.k_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.0.self_attn.v_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.0.self_attn.o_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.0.mlp.gate_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.0.mlp.up_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.0.mlp.down_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.0.input_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.0.post_attention_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.1.self_attn.q_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.1.self_attn.k_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.1.self_attn.v_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.1.self_attn.o_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.1.mlp.gate_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.1.mlp.up_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.1.mlp.down_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.1.input_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.1.post_attention_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.2.self_attn.q_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.2.self_attn.k_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.2.self_attn.v_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.2.self_attn.o_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.2.mlp.gate_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.2.mlp.up_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.2.mlp.down_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.2.input_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.2.post_attention_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.3.self_attn.q_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.3.self_attn.k_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.3.self_attn.v_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.3.self_attn.o_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.3.mlp.gate_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.3.mlp.up_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.3.mlp.down_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.3.input_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.3.post_attention_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.4.self_attn.q_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.4.self_attn.k_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.4.self_attn.v_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.4.self_attn.o_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.4.mlp.gate_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.4.mlp.up_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.4.mlp.down_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.4.input_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.4.post_attention_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.5.self_attn.q_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.5.self_attn.k_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.5.self_attn.v_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.5.self_attn.o_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.5.mlp.gate_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.5.mlp.up_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.5.mlp.down_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.5.input_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.5.post_attention_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.6.self_attn.q_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.6.self_attn.k_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.6.self_attn.v_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.6.self_attn.o_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.6.mlp.gate_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.6.mlp.up_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.6.mlp.down_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.6.input_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.6.post_attention_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.7.self_attn.q_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.7.self_attn.k_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.7.self_attn.v_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.7.self_attn.o_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.7.mlp.gate_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.7.mlp.up_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.7.mlp.down_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.7.input_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.7.post_attention_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.8.self_attn.q_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.8.self_attn.k_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.8.self_attn.v_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.8.self_attn.o_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.8.mlp.gate_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.8.mlp.up_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.8.mlp.down_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.8.input_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.8.post_attention_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.9.self_attn.q_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.9.self_attn.k_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.9.self_attn.v_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.9.self_attn.o_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.9.mlp.gate_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.9.mlp.up_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.9.mlp.down_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.9.input_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.9.post_attention_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.10.self_attn.q_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.10.self_attn.k_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.10.self_attn.v_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.10.self_attn.o_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.10.mlp.gate_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.10.mlp.up_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
Loading checkpoint shards: 20%|██ | 1/5 [00:00<00:00, 8.42it/s]/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.10.mlp.down_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.10.input_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.10.post_attention_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.11.self_attn.q_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.11.self_attn.k_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.11.self_attn.v_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.11.self_attn.o_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.11.mlp.gate_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.11.mlp.up_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.11.mlp.down_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.11.input_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.11.post_attention_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.12.self_attn.q_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.12.self_attn.k_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.12.self_attn.v_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.12.self_attn.o_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.12.mlp.gate_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.12.mlp.up_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.12.mlp.down_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.12.input_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.12.post_attention_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.13.self_attn.q_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.13.self_attn.k_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.13.self_attn.v_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.13.self_attn.o_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.13.mlp.gate_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.13.mlp.up_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.13.mlp.down_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.13.input_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.13.post_attention_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.14.self_attn.q_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.14.self_attn.k_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.14.self_attn.v_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.14.self_attn.o_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.14.mlp.gate_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.14.mlp.up_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.14.mlp.down_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.14.input_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.14.post_attention_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.15.self_attn.q_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.15.self_attn.k_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.15.self_attn.v_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.15.self_attn.o_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.15.mlp.gate_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.15.mlp.up_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.15.mlp.down_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.15.input_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.15.post_attention_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.16.self_attn.q_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.16.self_attn.k_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.16.self_attn.v_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.16.self_attn.o_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.16.mlp.gate_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.16.mlp.up_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.16.mlp.down_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.16.input_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.16.post_attention_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.17.self_attn.q_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.17.self_attn.k_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.17.self_attn.v_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.17.self_attn.o_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.17.mlp.gate_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.17.mlp.up_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.17.mlp.down_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.17.input_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.17.post_attention_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.18.self_attn.q_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.18.self_attn.k_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.18.self_attn.v_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.18.self_attn.o_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.18.mlp.gate_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.18.mlp.up_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.18.mlp.down_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.18.input_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.18.post_attention_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.19.self_attn.q_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.19.self_attn.k_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.19.self_attn.v_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.19.self_attn.o_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.19.mlp.gate_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.19.mlp.up_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.19.mlp.down_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.19.input_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.19.post_attention_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.20.self_attn.q_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.20.self_attn.k_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.20.self_attn.v_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.20.self_attn.o_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.20.mlp.gate_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.20.mlp.up_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.20.mlp.down_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.20.input_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.20.post_attention_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.21.self_attn.q_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.21.self_attn.k_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.21.self_attn.v_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.21.self_attn.o_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.21.mlp.gate_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.21.mlp.up_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.21.mlp.down_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.21.input_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.21.post_attention_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.22.self_attn.q_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.22.self_attn.k_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.22.self_attn.v_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.22.self_attn.o_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
Loading checkpoint shards: 40%|████ | 2/5 [00:00<00:00, 8.61it/s]/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.22.mlp.gate_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.22.mlp.up_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.22.mlp.down_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.22.input_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.22.post_attention_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.23.self_attn.q_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.23.self_attn.k_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.23.self_attn.v_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.23.self_attn.o_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.23.mlp.gate_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.23.mlp.up_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.23.mlp.down_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.23.input_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.23.post_attention_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.24.self_attn.q_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.24.self_attn.k_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.24.self_attn.v_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.24.self_attn.o_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.24.mlp.gate_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.24.mlp.up_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.24.mlp.down_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.24.input_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.24.post_attention_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.25.self_attn.q_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.25.self_attn.k_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.25.self_attn.v_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.25.self_attn.o_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.25.mlp.gate_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.25.mlp.up_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.25.mlp.down_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.25.input_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.25.post_attention_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.26.self_attn.q_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.26.self_attn.k_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.26.self_attn.v_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.26.self_attn.o_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.26.mlp.gate_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.26.mlp.up_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.26.mlp.down_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.26.input_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.26.post_attention_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.27.self_attn.q_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.27.self_attn.k_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.27.self_attn.v_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.27.self_attn.o_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.27.mlp.gate_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.27.mlp.up_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.27.mlp.down_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.27.input_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.27.post_attention_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.28.self_attn.q_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.28.self_attn.k_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.28.self_attn.v_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.28.self_attn.o_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.28.mlp.gate_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.28.mlp.up_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.28.mlp.down_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.28.input_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.28.post_attention_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.29.self_attn.q_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.29.self_attn.k_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.29.self_attn.v_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.29.self_attn.o_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.29.mlp.gate_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.29.mlp.up_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.29.mlp.down_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.29.input_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.29.post_attention_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.30.self_attn.q_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.30.self_attn.k_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.30.self_attn.v_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.30.self_attn.o_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.30.mlp.gate_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.30.mlp.up_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.30.mlp.down_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.30.input_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.30.post_attention_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.31.self_attn.q_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.31.self_attn.k_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.31.self_attn.v_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.31.self_attn.o_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.31.mlp.gate_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.31.mlp.up_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.31.mlp.down_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.31.input_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.31.post_attention_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.32.self_attn.q_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.32.self_attn.k_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.32.self_attn.v_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.32.self_attn.o_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.32.mlp.gate_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.32.mlp.up_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.32.mlp.down_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.32.input_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.32.post_attention_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.33.self_attn.q_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.33.self_attn.k_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.33.self_attn.v_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.33.self_attn.o_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.33.mlp.gate_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
Loading checkpoint shards: 60%|██████ | 3/5 [00:00<00:00, 8.84it/s]/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.33.mlp.up_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.33.mlp.down_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.33.input_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.33.post_attention_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.34.self_attn.q_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.34.self_attn.k_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.34.self_attn.v_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.34.self_attn.o_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.34.mlp.gate_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.34.mlp.up_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.34.mlp.down_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.34.input_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.34.post_attention_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.35.self_attn.q_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.35.self_attn.k_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.35.self_attn.v_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.35.self_attn.o_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.35.mlp.gate_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.35.mlp.up_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.35.mlp.down_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.35.input_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.35.post_attention_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.36.self_attn.q_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.36.self_attn.k_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.36.self_attn.v_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.36.self_attn.o_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.36.mlp.gate_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.36.mlp.up_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.36.mlp.down_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.36.input_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.36.post_attention_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.37.self_attn.q_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.37.self_attn.k_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.37.self_attn.v_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.37.self_attn.o_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.37.mlp.gate_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.37.mlp.up_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.37.mlp.down_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.37.input_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.37.post_attention_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.38.self_attn.q_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.38.self_attn.k_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.38.self_attn.v_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.38.self_attn.o_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.38.mlp.gate_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.38.mlp.up_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.38.mlp.down_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.38.input_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.38.post_attention_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.39.self_attn.q_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.39.self_attn.k_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.39.self_attn.v_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.39.self_attn.o_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.39.mlp.gate_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.39.mlp.up_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.39.mlp.down_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.39.input_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.39.post_attention_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.40.self_attn.q_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.40.self_attn.k_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.40.self_attn.v_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.40.self_attn.o_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.40.mlp.gate_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.40.mlp.up_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.40.mlp.down_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.40.input_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.40.post_attention_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.41.self_attn.q_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.41.self_attn.k_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.41.self_attn.v_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.41.self_attn.o_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.41.mlp.gate_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.41.mlp.up_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.41.mlp.down_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.41.input_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.41.post_attention_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.42.self_attn.q_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.42.self_attn.k_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.42.self_attn.v_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.42.self_attn.o_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.42.mlp.gate_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.42.mlp.up_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.42.mlp.down_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.42.input_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.42.post_attention_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.43.self_attn.q_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.43.self_attn.k_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.43.self_attn.v_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.43.self_attn.o_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.43.mlp.gate_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.43.mlp.up_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.43.mlp.down_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.43.input_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.43.post_attention_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.44.self_attn.q_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.44.self_attn.k_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.44.self_attn.v_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.44.self_attn.o_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.44.mlp.gate_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.44.mlp.up_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
Loading checkpoint shards: 80%|████████ | 4/5 [00:00<00:00, 9.08it/s]/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.44.mlp.down_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.44.input_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.44.post_attention_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.45.self_attn.q_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.45.self_attn.k_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.45.self_attn.v_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.45.self_attn.o_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.45.mlp.gate_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.45.mlp.up_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.45.mlp.down_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.45.input_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.45.post_attention_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.46.self_attn.q_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.46.self_attn.k_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.46.self_attn.v_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.46.self_attn.o_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.46.mlp.gate_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.46.mlp.up_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.46.mlp.down_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.46.input_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.46.post_attention_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.47.self_attn.q_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.47.self_attn.k_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.47.self_attn.v_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.47.self_attn.o_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.47.mlp.gate_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.47.mlp.up_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.47.mlp.down_proj.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.47.input_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.layers.47.post_attention_layernorm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for model.norm.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
/home/hangyu5/anaconda3/envs/llmcocktail/lib/python3.11/site-packages/torch/nn/modules/module.py:2025: UserWarning: for lm_head.weight: copying from a non-meta parameter in the checkpoint to a meta parameter in the current model, which is a no-op. (Did you mean to pass `assign=True` to assign items in the state dictionary to their corresponding key in the module instead of copying them in place?)
|
|||
|
|
warnings.warn(f'for {key}: copying from a non-meta parameter in the checkpoint to a meta '
|
|||
|
|
Loading checkpoint shards: 100%|██████████| 5/5 [00:00<00:00, 9.29it/s]
Loading checkpoint shards: 100%|██████████| 5/5 [00:00<00:00, 9.07it/s]
|
|||
|
|
Remove temporary file: /tmp/tmp_s47mw59.ckpt
|
|||
|
|
Remove temporary directory: /tmp/tmp3n6qz8ik
|
|||
|
|
Saving the new model to ./mixed_llm
|
|||
|
|
LlamaForCausalLM(
|
|||
|
|
(model): LlamaModel(
|
|||
|
|
(embed_tokens): Embedding(32000, 4096, padding_idx=2)
|
|||
|
|
(layers): ModuleList(
|
|||
|
|
(0-47): 48 x LlamaDecoderLayer(
|
|||
|
|
(self_attn): LlamaSdpaAttention(
|
|||
|
|
(q_proj): Linear(in_features=4096, out_features=4096, bias=False)
|
|||
|
|
(k_proj): Linear(in_features=4096, out_features=1024, bias=False)
|
|||
|
|
(v_proj): Linear(in_features=4096, out_features=1024, bias=False)
|
|||
|
|
(o_proj): Linear(in_features=4096, out_features=4096, bias=False)
|
|||
|
|
(rotary_emb): LlamaRotaryEmbedding()
|
|||
|
|
)
|
|||
|
|
(mlp): LlamaMLP(
|
|||
|
|
(gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
|
|||
|
|
(up_proj): Linear(in_features=4096, out_features=14336, bias=False)
|
|||
|
|
(down_proj): Linear(in_features=14336, out_features=4096, bias=False)
|
|||
|
|
(act_fn): SiLU()
|
|||
|
|
)
|
|||
|
|
(input_layernorm): LlamaRMSNorm()
|
|||
|
|
(post_attention_layernorm): LlamaRMSNorm()
|
|||
|
|
)
|
|||
|
|
)
|
|||
|
|
(norm): LlamaRMSNorm()
|
|||
|
|
)
|
|||
|
|
(lm_head): Linear(in_features=4096, out_features=32000, bias=False)
|
|||
|
|
)
|