初始化项目,由ModelHub XC社区提供模型
Model: DCAgent/g1_gptlong_top8_32b Source: Original Platform
This commit is contained in:
36
.gitattributes
vendored
Normal file
36
.gitattributes
vendored
Normal file
@@ -0,0 +1,36 @@
|
|||||||
|
*.7z filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.arrow filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.bin filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ftz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.gz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.h5 filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.joblib filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.model filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.npy filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.npz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.onnx filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ot filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.parquet filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pb filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pickle filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pkl filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pt filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pth filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.rar filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
||||||
|
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tar filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tflite filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tgz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.wasm filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.xz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.zip filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.zst filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
||||||
60
README.md
Normal file
60
README.md
Normal file
@@ -0,0 +1,60 @@
|
|||||||
|
---
|
||||||
|
library_name: transformers
|
||||||
|
license: other
|
||||||
|
base_model: Qwen/Qwen3-32B
|
||||||
|
tags:
|
||||||
|
- llama-factory
|
||||||
|
- full
|
||||||
|
- generated_from_trainer
|
||||||
|
model-index:
|
||||||
|
- name: sft_g1_gptlong_top8_32b__Qwen3-32B
|
||||||
|
results: []
|
||||||
|
---
|
||||||
|
|
||||||
|
<!-- This model card has been generated automatically according to the information the Trainer had access to. You
|
||||||
|
should probably proofread and complete it, then remove this comment. -->
|
||||||
|
|
||||||
|
# sft_g1_gptlong_top8_32b__Qwen3-32B
|
||||||
|
|
||||||
|
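This model is a fine-tuned version of [Qwen/Qwen3-32B](https://huggingface.co/Qwen/Qwen3-32B) on a thinking-preprocessed copy of the `DCAgent/g1_min_episodes_e1_gpt_long_top8_glm47_traces` dataset (snapshot `9828cc7d5cb31c19ed7e6dead76bd24dc2d66262`), loaded during training from the local path `/e/scratch/jureap59/raoof1/sft_data/hf_hub/datasets--DCAgent--g1_min_episodes_e1_gpt_long_top8_glm47_traces/snapshots/9828cc7d5cb31c19ed7e6dead76bd24dc2d66262_thinking_preprocessed`.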
|
||||||
|
|
||||||
|
## Model description
|
||||||
|
|
||||||
|
More information needed
|
||||||
|
|
||||||
|
## Intended uses & limitations
|
||||||
|
|
||||||
|
More information needed
|
||||||
|
|
||||||
|
## Training and evaluation data
|
||||||
|
|
||||||
|
More information needed
|
||||||
|
|
||||||
|
## Training procedure
|
||||||
|
|
||||||
|
### Training hyperparameters
|
||||||
|
|
||||||
|
The following hyperparameters were used during training:
|
||||||
|
- learning_rate: 4e-05
|
||||||
|
- train_batch_size: 1
|
||||||
|
- eval_batch_size: 8
|
||||||
|
- seed: 42
|
||||||
|
- distributed_type: multi-GPU
|
||||||
|
- num_devices: 96
|
||||||
|
- total_train_batch_size: 96
|
||||||
|
- total_eval_batch_size: 768
|
||||||
|
- optimizer: fused PyTorch AdamW (`OptimizerNames.ADAMW_TORCH_FUSED`) with betas=(0.9,0.98), epsilon=1e-08, and no additional optimizer arguments
|
||||||
|
- lr_scheduler_type: cosine
|
||||||
|
- lr_scheduler_warmup_ratio: 0.1
|
||||||
|
- num_epochs: 5.0
|
||||||
|
|
||||||
|
### Training results
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
### Framework versions
|
||||||
|
|
||||||
|
- Transformers 4.57.6
|
||||||
|
- Pytorch 2.9.1+cu130
|
||||||
|
- Datasets 4.7.0
|
||||||
|
- Tokenizers 0.22.2
|
||||||
28
added_tokens.json
Normal file
28
added_tokens.json
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
{
|
||||||
|
"</think>": 151668,
|
||||||
|
"</tool_call>": 151658,
|
||||||
|
"</tool_response>": 151666,
|
||||||
|
"<think>": 151667,
|
||||||
|
"<tool_call>": 151657,
|
||||||
|
"<tool_response>": 151665,
|
||||||
|
"<|box_end|>": 151649,
|
||||||
|
"<|box_start|>": 151648,
|
||||||
|
"<|endoftext|>": 151643,
|
||||||
|
"<|file_sep|>": 151664,
|
||||||
|
"<|fim_middle|>": 151660,
|
||||||
|
"<|fim_pad|>": 151662,
|
||||||
|
"<|fim_prefix|>": 151659,
|
||||||
|
"<|fim_suffix|>": 151661,
|
||||||
|
"<|im_end|>": 151645,
|
||||||
|
"<|im_start|>": 151644,
|
||||||
|
"<|image_pad|>": 151655,
|
||||||
|
"<|object_ref_end|>": 151647,
|
||||||
|
"<|object_ref_start|>": 151646,
|
||||||
|
"<|quad_end|>": 151651,
|
||||||
|
"<|quad_start|>": 151650,
|
||||||
|
"<|repo_name|>": 151663,
|
||||||
|
"<|video_pad|>": 151656,
|
||||||
|
"<|vision_end|>": 151653,
|
||||||
|
"<|vision_pad|>": 151654,
|
||||||
|
"<|vision_start|>": 151652
|
||||||
|
}
|
||||||
16
all_results.json
Normal file
16
all_results.json
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
{
|
||||||
|
"achieved_tflops_per_gpu": 0.010354545414379324,
|
||||||
|
"achieved_tflops_per_gpu_theoretical": 1519.821207581491,
|
||||||
|
"epoch": 5.0,
|
||||||
|
"loss_nan_ranks": 0,
|
||||||
|
"loss_rank_avg": 0.21731798350811005,
|
||||||
|
"mfu_percent": 0.0007317699939490688,
|
||||||
|
"mfu_percent_theoretical": 107.40785919303823,
|
||||||
|
"total_flos": 9850857283649536.0,
|
||||||
|
"train_loss": 0.03974963869139502,
|
||||||
|
"train_runtime": 9909.9567,
|
||||||
|
"train_samples_per_second": 14.475,
|
||||||
|
"train_steps_per_second": 0.151,
|
||||||
|
"valid_targets_mean": 1465.8,
|
||||||
|
"valid_targets_min": 325
|
||||||
|
}
|
||||||
89
chat_template.jinja
Normal file
89
chat_template.jinja
Normal file
@@ -0,0 +1,89 @@
|
|||||||
|
{%- if tools %}
|
||||||
|
{{- '<|im_start|>system\n' }}
|
||||||
|
{%- if messages[0].role == 'system' %}
|
||||||
|
{{- messages[0].content + '\n\n' }}
|
||||||
|
{%- endif %}
|
||||||
|
{{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
|
||||||
|
{%- for tool in tools %}
|
||||||
|
{{- "\n" }}
|
||||||
|
{{- tool | tojson }}
|
||||||
|
{%- endfor %}
|
||||||
|
{{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
|
||||||
|
{%- else %}
|
||||||
|
{%- if messages[0].role == 'system' %}
|
||||||
|
{{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
|
||||||
|
{%- endif %}
|
||||||
|
{%- endif %}
|
||||||
|
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
|
||||||
|
{%- for message in messages[::-1] %}
|
||||||
|
{%- set index = (messages|length - 1) - loop.index0 %}
|
||||||
|
{%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
|
||||||
|
{%- set ns.multi_step_tool = false %}
|
||||||
|
{%- set ns.last_query_index = index %}
|
||||||
|
{%- endif %}
|
||||||
|
{%- endfor %}
|
||||||
|
{%- for message in messages %}
|
||||||
|
{%- if message.content is string %}
|
||||||
|
{%- set content = message.content %}
|
||||||
|
{%- else %}
|
||||||
|
{%- set content = '' %}
|
||||||
|
{%- endif %}
|
||||||
|
{%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
|
||||||
|
{{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
|
||||||
|
{%- elif message.role == "assistant" %}
|
||||||
|
{%- set reasoning_content = '' %}
|
||||||
|
{%- if message.reasoning_content is string %}
|
||||||
|
{%- set reasoning_content = message.reasoning_content %}
|
||||||
|
{%- else %}
|
||||||
|
{%- if '</think>' in content %}
|
||||||
|
{%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
|
||||||
|
{%- set content = content.split('</think>')[-1].lstrip('\n') %}
|
||||||
|
{%- endif %}
|
||||||
|
{%- endif %}
|
||||||
|
{%- if loop.index0 > ns.last_query_index %}
|
||||||
|
{%- if loop.last or (not loop.last and reasoning_content) %}
|
||||||
|
{{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
|
||||||
|
{%- else %}
|
||||||
|
{{- '<|im_start|>' + message.role + '\n' + content }}
|
||||||
|
{%- endif %}
|
||||||
|
{%- else %}
|
||||||
|
{{- '<|im_start|>' + message.role + '\n' + content }}
|
||||||
|
{%- endif %}
|
||||||
|
{%- if message.tool_calls %}
|
||||||
|
{%- for tool_call in message.tool_calls %}
|
||||||
|
{%- if (loop.first and content) or (not loop.first) %}
|
||||||
|
{{- '\n' }}
|
||||||
|
{%- endif %}
|
||||||
|
{%- if tool_call.function %}
|
||||||
|
{%- set tool_call = tool_call.function %}
|
||||||
|
{%- endif %}
|
||||||
|
{{- '<tool_call>\n{"name": "' }}
|
||||||
|
{{- tool_call.name }}
|
||||||
|
{{- '", "arguments": ' }}
|
||||||
|
{%- if tool_call.arguments is string %}
|
||||||
|
{{- tool_call.arguments }}
|
||||||
|
{%- else %}
|
||||||
|
{{- tool_call.arguments | tojson }}
|
||||||
|
{%- endif %}
|
||||||
|
{{- '}\n</tool_call>' }}
|
||||||
|
{%- endfor %}
|
||||||
|
{%- endif %}
|
||||||
|
{{- '<|im_end|>\n' }}
|
||||||
|
{%- elif message.role == "tool" %}
|
||||||
|
{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
|
||||||
|
{{- '<|im_start|>user' }}
|
||||||
|
{%- endif %}
|
||||||
|
{{- '\n<tool_response>\n' }}
|
||||||
|
{{- content }}
|
||||||
|
{{- '\n</tool_response>' }}
|
||||||
|
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
|
||||||
|
{{- '<|im_end|>\n' }}
|
||||||
|
{%- endif %}
|
||||||
|
{%- endif %}
|
||||||
|
{%- endfor %}
|
||||||
|
{%- if add_generation_prompt %}
|
||||||
|
{{- '<|im_start|>assistant\n' }}
|
||||||
|
{%- if enable_thinking is defined and enable_thinking is false %}
|
||||||
|
{{- '<think>\n\n</think>\n\n' }}
|
||||||
|
{%- endif %}
|
||||||
|
{%- endif %}
|
||||||
96
config.json
Normal file
96
config.json
Normal file
@@ -0,0 +1,96 @@
|
|||||||
|
{
|
||||||
|
"architectures": [
|
||||||
|
"Qwen3ForCausalLM"
|
||||||
|
],
|
||||||
|
"attention_bias": false,
|
||||||
|
"attention_dropout": 0.0,
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"eos_token_id": 151645,
|
||||||
|
"head_dim": 128,
|
||||||
|
"hidden_act": "silu",
|
||||||
|
"hidden_size": 5120,
|
||||||
|
"initializer_range": 0.02,
|
||||||
|
"intermediate_size": 25600,
|
||||||
|
"layer_types": [
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention",
|
||||||
|
"full_attention"
|
||||||
|
],
|
||||||
|
"max_position_embeddings": 40960,
|
||||||
|
"max_window_layers": 64,
|
||||||
|
"model_type": "qwen3",
|
||||||
|
"num_attention_heads": 64,
|
||||||
|
"num_hidden_layers": 64,
|
||||||
|
"num_key_value_heads": 8,
|
||||||
|
"pad_token_id": 151643,
|
||||||
|
"rms_norm_eps": 1e-06,
|
||||||
|
"rope_scaling": null,
|
||||||
|
"rope_theta": 1000000,
|
||||||
|
"sliding_window": null,
|
||||||
|
"tie_word_embeddings": false,
|
||||||
|
"transformers_version": "4.57.6",
|
||||||
|
"use_cache": false,
|
||||||
|
"use_sliding_window": false,
|
||||||
|
"vocab_size": 151936
|
||||||
|
}
|
||||||
12
generation_config.json
Normal file
12
generation_config.json
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
{
|
||||||
|
"do_sample": true,
|
||||||
|
"eos_token_id": [
|
||||||
|
151645,
|
||||||
|
151643
|
||||||
|
],
|
||||||
|
"pad_token_id": 151643,
|
||||||
|
"temperature": 0.6,
|
||||||
|
"top_k": 20,
|
||||||
|
"top_p": 0.95,
|
||||||
|
"transformers_version": "4.57.6"
|
||||||
|
}
|
||||||
151388
merges.txt
Normal file
151388
merges.txt
Normal file
File diff suppressed because it is too large
Load Diff
3
model-00001-of-00014.safetensors
Normal file
3
model-00001-of-00014.safetensors
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:870ee9f46c8d5facda4a3b9315edcd76a1f0766bcda10e4b386e414b132a4184
|
||||||
|
size 4932307584
|
||||||
3
model-00002-of-00014.safetensors
Normal file
3
model-00002-of-00014.safetensors
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:81bc627c621b5c983ca07605d60ac038f2885f795e538e98cddfac3850faffe5
|
||||||
|
size 4875989696
|
||||||
3
model-00003-of-00014.safetensors
Normal file
3
model-00003-of-00014.safetensors
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:be969b1429d3a020a2f2abf702bc99509bb0c823fc0c567f511c2cda55c3354b
|
||||||
|
size 4875989720
|
||||||
3
model-00004-of-00014.safetensors
Normal file
3
model-00004-of-00014.safetensors
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:0d316fc9d7a5f311713ecd75df2b614a00ed2f4f039ace1a951790f05cb02220
|
||||||
|
size 4875989752
|
||||||
3
model-00005-of-00014.safetensors
Normal file
3
model-00005-of-00014.safetensors
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:6ada5802bfbe8a8f76ccfe9ecbaa63729035c175f55d46786d09decf24b293f0
|
||||||
|
size 4875989752
|
||||||
3
model-00006-of-00014.safetensors
Normal file
3
model-00006-of-00014.safetensors
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:ffe081c3ac992a055e593d8daa9ef0a87bfc8820dbb817af9dbb0ac914f71a3c
|
||||||
|
size 4875989752
|
||||||
3
model-00007-of-00014.safetensors
Normal file
3
model-00007-of-00014.safetensors
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:3e876cd68dc4341f8a03ff8d188bdcec4cfd83db29955d92a57cbe39100c31b1
|
||||||
|
size 4875989752
|
||||||
3
model-00008-of-00014.safetensors
Normal file
3
model-00008-of-00014.safetensors
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:d294973f0c4e80afd24458f7cb3344ae8acc256ddd69d042b47135df5d6d6d0c
|
||||||
|
size 4875989752
|
||||||
3
model-00009-of-00014.safetensors
Normal file
3
model-00009-of-00014.safetensors
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:265b102f5bba0cdc030f3755570d64b37f0ff20ba03f95014207a5da03d7c276
|
||||||
|
size 4875989752
|
||||||
3
model-00010-of-00014.safetensors
Normal file
3
model-00010-of-00014.safetensors
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:9e41cfef8952d6aa251dad2095f25b9a864658cff7c2a78640b62f44fc24faf2
|
||||||
|
size 4875989752
|
||||||
3
model-00011-of-00014.safetensors
Normal file
3
model-00011-of-00014.safetensors
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:a715c7d3191d0076f40d2584fd39be14d36c072d8ceb2ac66cbe151603292322
|
||||||
|
size 4875989752
|
||||||
3
model-00012-of-00014.safetensors
Normal file
3
model-00012-of-00014.safetensors
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:a429b7ea08bdf1e3dddbf63acc3e790d51ea4ac7b57a9b3736040cd44866aa98
|
||||||
|
size 4875989752
|
||||||
3
model-00013-of-00014.safetensors
Normal file
3
model-00013-of-00014.safetensors
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:b3654e4b77f9ba5a0e66b80c7efeb3293751b07e2c5b7c4c512824a9589ab562
|
||||||
|
size 4875989752
|
||||||
3
model-00014-of-00014.safetensors
Normal file
3
model-00014-of-00014.safetensors
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:cee186efdd1b9df03ed54ec89e07a5e2c849a0a2a8668cfee8942cd924ec72c5
|
||||||
|
size 2080144040
|
||||||
715
model.safetensors.index.json
Normal file
715
model.safetensors.index.json
Normal file
@@ -0,0 +1,715 @@
|
|||||||
|
{
|
||||||
|
"metadata": {
|
||||||
|
"total_parameters": 676864,
|
||||||
|
"total_size": 65524246528
|
||||||
|
},
|
||||||
|
"weight_map": {
|
||||||
|
"lm_head.weight": "model-00014-of-00014.safetensors",
|
||||||
|
"model.embed_tokens.weight": "model-00001-of-00014.safetensors",
|
||||||
|
"model.layers.0.input_layernorm.weight": "model-00001-of-00014.safetensors",
|
||||||
|
"model.layers.0.mlp.down_proj.weight": "model-00001-of-00014.safetensors",
|
||||||
|
"model.layers.0.mlp.gate_proj.weight": "model-00001-of-00014.safetensors",
|
||||||
|
"model.layers.0.mlp.up_proj.weight": "model-00001-of-00014.safetensors",
|
||||||
|
"model.layers.0.post_attention_layernorm.weight": "model-00001-of-00014.safetensors",
|
||||||
|
"model.layers.0.self_attn.k_norm.weight": "model-00001-of-00014.safetensors",
|
||||||
|
"model.layers.0.self_attn.k_proj.weight": "model-00001-of-00014.safetensors",
|
||||||
|
"model.layers.0.self_attn.o_proj.weight": "model-00001-of-00014.safetensors",
|
||||||
|
"model.layers.0.self_attn.q_norm.weight": "model-00001-of-00014.safetensors",
|
||||||
|
"model.layers.0.self_attn.q_proj.weight": "model-00001-of-00014.safetensors",
|
||||||
|
"model.layers.0.self_attn.v_proj.weight": "model-00001-of-00014.safetensors",
|
||||||
|
"model.layers.1.input_layernorm.weight": "model-00001-of-00014.safetensors",
|
||||||
|
"model.layers.1.mlp.down_proj.weight": "model-00001-of-00014.safetensors",
|
||||||
|
"model.layers.1.mlp.gate_proj.weight": "model-00001-of-00014.safetensors",
|
||||||
|
"model.layers.1.mlp.up_proj.weight": "model-00001-of-00014.safetensors",
|
||||||
|
"model.layers.1.post_attention_layernorm.weight": "model-00001-of-00014.safetensors",
|
||||||
|
"model.layers.1.self_attn.k_norm.weight": "model-00001-of-00014.safetensors",
|
||||||
|
"model.layers.1.self_attn.k_proj.weight": "model-00001-of-00014.safetensors",
|
||||||
|
"model.layers.1.self_attn.o_proj.weight": "model-00001-of-00014.safetensors",
|
||||||
|
"model.layers.1.self_attn.q_norm.weight": "model-00001-of-00014.safetensors",
|
||||||
|
"model.layers.1.self_attn.q_proj.weight": "model-00001-of-00014.safetensors",
|
||||||
|
"model.layers.1.self_attn.v_proj.weight": "model-00001-of-00014.safetensors",
|
||||||
|
"model.layers.10.input_layernorm.weight": "model-00003-of-00014.safetensors",
|
||||||
|
"model.layers.10.mlp.down_proj.weight": "model-00003-of-00014.safetensors",
|
||||||
|
"model.layers.10.mlp.gate_proj.weight": "model-00003-of-00014.safetensors",
|
||||||
|
"model.layers.10.mlp.up_proj.weight": "model-00003-of-00014.safetensors",
|
||||||
|
"model.layers.10.post_attention_layernorm.weight": "model-00003-of-00014.safetensors",
|
||||||
|
"model.layers.10.self_attn.k_norm.weight": "model-00003-of-00014.safetensors",
|
||||||
|
"model.layers.10.self_attn.k_proj.weight": "model-00003-of-00014.safetensors",
|
||||||
|
"model.layers.10.self_attn.o_proj.weight": "model-00003-of-00014.safetensors",
|
||||||
|
"model.layers.10.self_attn.q_norm.weight": "model-00003-of-00014.safetensors",
|
||||||
|
"model.layers.10.self_attn.q_proj.weight": "model-00003-of-00014.safetensors",
|
||||||
|
"model.layers.10.self_attn.v_proj.weight": "model-00003-of-00014.safetensors",
|
||||||
|
"model.layers.11.input_layernorm.weight": "model-00003-of-00014.safetensors",
|
||||||
|
"model.layers.11.mlp.down_proj.weight": "model-00003-of-00014.safetensors",
|
||||||
|
"model.layers.11.mlp.gate_proj.weight": "model-00003-of-00014.safetensors",
|
||||||
|
"model.layers.11.mlp.up_proj.weight": "model-00003-of-00014.safetensors",
|
||||||
|
"model.layers.11.post_attention_layernorm.weight": "model-00003-of-00014.safetensors",
|
||||||
|
"model.layers.11.self_attn.k_norm.weight": "model-00003-of-00014.safetensors",
|
||||||
|
"model.layers.11.self_attn.k_proj.weight": "model-00003-of-00014.safetensors",
|
||||||
|
"model.layers.11.self_attn.o_proj.weight": "model-00003-of-00014.safetensors",
|
||||||
|
"model.layers.11.self_attn.q_norm.weight": "model-00003-of-00014.safetensors",
|
||||||
|
"model.layers.11.self_attn.q_proj.weight": "model-00003-of-00014.safetensors",
|
||||||
|
"model.layers.11.self_attn.v_proj.weight": "model-00003-of-00014.safetensors",
|
||||||
|
"model.layers.12.input_layernorm.weight": "model-00003-of-00014.safetensors",
|
||||||
|
"model.layers.12.mlp.down_proj.weight": "model-00003-of-00014.safetensors",
|
||||||
|
"model.layers.12.mlp.gate_proj.weight": "model-00003-of-00014.safetensors",
|
||||||
|
"model.layers.12.mlp.up_proj.weight": "model-00003-of-00014.safetensors",
|
||||||
|
"model.layers.12.post_attention_layernorm.weight": "model-00003-of-00014.safetensors",
|
||||||
|
"model.layers.12.self_attn.k_norm.weight": "model-00003-of-00014.safetensors",
|
||||||
|
"model.layers.12.self_attn.k_proj.weight": "model-00003-of-00014.safetensors",
|
||||||
|
"model.layers.12.self_attn.o_proj.weight": "model-00003-of-00014.safetensors",
|
||||||
|
"model.layers.12.self_attn.q_norm.weight": "model-00003-of-00014.safetensors",
|
||||||
|
"model.layers.12.self_attn.q_proj.weight": "model-00003-of-00014.safetensors",
|
||||||
|
"model.layers.12.self_attn.v_proj.weight": "model-00003-of-00014.safetensors",
|
||||||
|
"model.layers.13.input_layernorm.weight": "model-00004-of-00014.safetensors",
|
||||||
|
"model.layers.13.mlp.down_proj.weight": "model-00004-of-00014.safetensors",
|
||||||
|
"model.layers.13.mlp.gate_proj.weight": "model-00003-of-00014.safetensors",
|
||||||
|
"model.layers.13.mlp.up_proj.weight": "model-00004-of-00014.safetensors",
|
||||||
|
"model.layers.13.post_attention_layernorm.weight": "model-00004-of-00014.safetensors",
|
||||||
|
"model.layers.13.self_attn.k_norm.weight": "model-00003-of-00014.safetensors",
|
||||||
|
"model.layers.13.self_attn.k_proj.weight": "model-00003-of-00014.safetensors",
|
||||||
|
"model.layers.13.self_attn.o_proj.weight": "model-00003-of-00014.safetensors",
|
||||||
|
"model.layers.13.self_attn.q_norm.weight": "model-00003-of-00014.safetensors",
|
||||||
|
"model.layers.13.self_attn.q_proj.weight": "model-00003-of-00014.safetensors",
|
||||||
|
"model.layers.13.self_attn.v_proj.weight": "model-00003-of-00014.safetensors",
|
||||||
|
"model.layers.14.input_layernorm.weight": "model-00004-of-00014.safetensors",
|
||||||
|
"model.layers.14.mlp.down_proj.weight": "model-00004-of-00014.safetensors",
|
||||||
|
"model.layers.14.mlp.gate_proj.weight": "model-00004-of-00014.safetensors",
|
||||||
|
"model.layers.14.mlp.up_proj.weight": "model-00004-of-00014.safetensors",
|
||||||
|
"model.layers.14.post_attention_layernorm.weight": "model-00004-of-00014.safetensors",
|
||||||
|
"model.layers.14.self_attn.k_norm.weight": "model-00004-of-00014.safetensors",
|
||||||
|
"model.layers.14.self_attn.k_proj.weight": "model-00004-of-00014.safetensors",
|
||||||
|
"model.layers.14.self_attn.o_proj.weight": "model-00004-of-00014.safetensors",
|
||||||
|
"model.layers.14.self_attn.q_norm.weight": "model-00004-of-00014.safetensors",
|
||||||
|
"model.layers.14.self_attn.q_proj.weight": "model-00004-of-00014.safetensors",
|
||||||
|
"model.layers.14.self_attn.v_proj.weight": "model-00004-of-00014.safetensors",
|
||||||
|
"model.layers.15.input_layernorm.weight": "model-00004-of-00014.safetensors",
|
||||||
|
"model.layers.15.mlp.down_proj.weight": "model-00004-of-00014.safetensors",
|
||||||
|
"model.layers.15.mlp.gate_proj.weight": "model-00004-of-00014.safetensors",
|
||||||
|
"model.layers.15.mlp.up_proj.weight": "model-00004-of-00014.safetensors",
|
||||||
|
"model.layers.15.post_attention_layernorm.weight": "model-00004-of-00014.safetensors",
|
||||||
|
"model.layers.15.self_attn.k_norm.weight": "model-00004-of-00014.safetensors",
|
||||||
|
"model.layers.15.self_attn.k_proj.weight": "model-00004-of-00014.safetensors",
|
||||||
|
"model.layers.15.self_attn.o_proj.weight": "model-00004-of-00014.safetensors",
|
||||||
|
"model.layers.15.self_attn.q_norm.weight": "model-00004-of-00014.safetensors",
|
||||||
|
"model.layers.15.self_attn.q_proj.weight": "model-00004-of-00014.safetensors",
|
||||||
|
"model.layers.15.self_attn.v_proj.weight": "model-00004-of-00014.safetensors",
|
||||||
|
"model.layers.16.input_layernorm.weight": "model-00004-of-00014.safetensors",
|
||||||
|
"model.layers.16.mlp.down_proj.weight": "model-00004-of-00014.safetensors",
|
||||||
|
"model.layers.16.mlp.gate_proj.weight": "model-00004-of-00014.safetensors",
|
||||||
|
"model.layers.16.mlp.up_proj.weight": "model-00004-of-00014.safetensors",
|
||||||
|
"model.layers.16.post_attention_layernorm.weight": "model-00004-of-00014.safetensors",
|
||||||
|
"model.layers.16.self_attn.k_norm.weight": "model-00004-of-00014.safetensors",
|
||||||
|
"model.layers.16.self_attn.k_proj.weight": "model-00004-of-00014.safetensors",
|
||||||
|
"model.layers.16.self_attn.o_proj.weight": "model-00004-of-00014.safetensors",
|
||||||
|
"model.layers.16.self_attn.q_norm.weight": "model-00004-of-00014.safetensors",
|
||||||
|
"model.layers.16.self_attn.q_proj.weight": "model-00004-of-00014.safetensors",
|
||||||
|
"model.layers.16.self_attn.v_proj.weight": "model-00004-of-00014.safetensors",
|
||||||
|
"model.layers.17.input_layernorm.weight": "model-00004-of-00014.safetensors",
|
||||||
|
"model.layers.17.mlp.down_proj.weight": "model-00004-of-00014.safetensors",
|
||||||
|
"model.layers.17.mlp.gate_proj.weight": "model-00004-of-00014.safetensors",
|
||||||
|
"model.layers.17.mlp.up_proj.weight": "model-00004-of-00014.safetensors",
|
||||||
|
"model.layers.17.post_attention_layernorm.weight": "model-00004-of-00014.safetensors",
|
||||||
|
"model.layers.17.self_attn.k_norm.weight": "model-00004-of-00014.safetensors",
|
||||||
|
"model.layers.17.self_attn.k_proj.weight": "model-00004-of-00014.safetensors",
|
||||||
|
"model.layers.17.self_attn.o_proj.weight": "model-00004-of-00014.safetensors",
|
||||||
|
"model.layers.17.self_attn.q_norm.weight": "model-00004-of-00014.safetensors",
|
||||||
|
"model.layers.17.self_attn.q_proj.weight": "model-00004-of-00014.safetensors",
|
||||||
|
"model.layers.17.self_attn.v_proj.weight": "model-00004-of-00014.safetensors",
|
||||||
|
"model.layers.18.input_layernorm.weight": "model-00005-of-00014.safetensors",
|
||||||
|
"model.layers.18.mlp.down_proj.weight": "model-00005-of-00014.safetensors",
|
||||||
|
"model.layers.18.mlp.gate_proj.weight": "model-00004-of-00014.safetensors",
|
||||||
|
"model.layers.18.mlp.up_proj.weight": "model-00005-of-00014.safetensors",
|
||||||
|
"model.layers.18.post_attention_layernorm.weight": "model-00005-of-00014.safetensors",
|
||||||
|
"model.layers.18.self_attn.k_norm.weight": "model-00004-of-00014.safetensors",
|
||||||
|
"model.layers.18.self_attn.k_proj.weight": "model-00004-of-00014.safetensors",
|
||||||
|
"model.layers.18.self_attn.o_proj.weight": "model-00004-of-00014.safetensors",
|
||||||
|
"model.layers.18.self_attn.q_norm.weight": "model-00004-of-00014.safetensors",
|
||||||
|
"model.layers.18.self_attn.q_proj.weight": "model-00004-of-00014.safetensors",
|
||||||
|
"model.layers.18.self_attn.v_proj.weight": "model-00004-of-00014.safetensors",
|
||||||
|
"model.layers.19.input_layernorm.weight": "model-00005-of-00014.safetensors",
|
||||||
|
"model.layers.19.mlp.down_proj.weight": "model-00005-of-00014.safetensors",
|
||||||
|
"model.layers.19.mlp.gate_proj.weight": "model-00005-of-00014.safetensors",
|
||||||
|
"model.layers.19.mlp.up_proj.weight": "model-00005-of-00014.safetensors",
|
||||||
|
"model.layers.19.post_attention_layernorm.weight": "model-00005-of-00014.safetensors",
|
||||||
|
"model.layers.19.self_attn.k_norm.weight": "model-00005-of-00014.safetensors",
|
||||||
|
"model.layers.19.self_attn.k_proj.weight": "model-00005-of-00014.safetensors",
|
||||||
|
"model.layers.19.self_attn.o_proj.weight": "model-00005-of-00014.safetensors",
|
||||||
|
"model.layers.19.self_attn.q_norm.weight": "model-00005-of-00014.safetensors",
|
||||||
|
"model.layers.19.self_attn.q_proj.weight": "model-00005-of-00014.safetensors",
|
||||||
|
"model.layers.19.self_attn.v_proj.weight": "model-00005-of-00014.safetensors",
|
||||||
|
"model.layers.2.input_layernorm.weight": "model-00001-of-00014.safetensors",
|
||||||
|
"model.layers.2.mlp.down_proj.weight": "model-00001-of-00014.safetensors",
|
||||||
|
"model.layers.2.mlp.gate_proj.weight": "model-00001-of-00014.safetensors",
|
||||||
|
"model.layers.2.mlp.up_proj.weight": "model-00001-of-00014.safetensors",
|
||||||
|
"model.layers.2.post_attention_layernorm.weight": "model-00001-of-00014.safetensors",
|
||||||
|
"model.layers.2.self_attn.k_norm.weight": "model-00001-of-00014.safetensors",
|
||||||
|
"model.layers.2.self_attn.k_proj.weight": "model-00001-of-00014.safetensors",
|
||||||
|
"model.layers.2.self_attn.o_proj.weight": "model-00001-of-00014.safetensors",
|
||||||
|
"model.layers.2.self_attn.q_norm.weight": "model-00001-of-00014.safetensors",
|
||||||
|
"model.layers.2.self_attn.q_proj.weight": "model-00001-of-00014.safetensors",
|
||||||
|
"model.layers.2.self_attn.v_proj.weight": "model-00001-of-00014.safetensors",
|
||||||
|
"model.layers.20.input_layernorm.weight": "model-00005-of-00014.safetensors",
|
||||||
|
"model.layers.20.mlp.down_proj.weight": "model-00005-of-00014.safetensors",
|
||||||
|
"model.layers.20.mlp.gate_proj.weight": "model-00005-of-00014.safetensors",
|
||||||
|
"model.layers.20.mlp.up_proj.weight": "model-00005-of-00014.safetensors",
|
||||||
|
"model.layers.20.post_attention_layernorm.weight": "model-00005-of-00014.safetensors",
|
||||||
|
"model.layers.20.self_attn.k_norm.weight": "model-00005-of-00014.safetensors",
|
||||||
|
"model.layers.20.self_attn.k_proj.weight": "model-00005-of-00014.safetensors",
|
||||||
|
"model.layers.20.self_attn.o_proj.weight": "model-00005-of-00014.safetensors",
|
||||||
|
"model.layers.20.self_attn.q_norm.weight": "model-00005-of-00014.safetensors",
|
||||||
|
"model.layers.20.self_attn.q_proj.weight": "model-00005-of-00014.safetensors",
|
||||||
|
"model.layers.20.self_attn.v_proj.weight": "model-00005-of-00014.safetensors",
|
||||||
|
"model.layers.21.input_layernorm.weight": "model-00005-of-00014.safetensors",
|
||||||
|
"model.layers.21.mlp.down_proj.weight": "model-00005-of-00014.safetensors",
|
||||||
|
"model.layers.21.mlp.gate_proj.weight": "model-00005-of-00014.safetensors",
|
||||||
|
"model.layers.21.mlp.up_proj.weight": "model-00005-of-00014.safetensors",
|
||||||
|
"model.layers.21.post_attention_layernorm.weight": "model-00005-of-00014.safetensors",
|
||||||
|
"model.layers.21.self_attn.k_norm.weight": "model-00005-of-00014.safetensors",
|
||||||
|
"model.layers.21.self_attn.k_proj.weight": "model-00005-of-00014.safetensors",
|
||||||
|
"model.layers.21.self_attn.o_proj.weight": "model-00005-of-00014.safetensors",
|
||||||
|
"model.layers.21.self_attn.q_norm.weight": "model-00005-of-00014.safetensors",
|
||||||
|
"model.layers.21.self_attn.q_proj.weight": "model-00005-of-00014.safetensors",
|
||||||
|
"model.layers.21.self_attn.v_proj.weight": "model-00005-of-00014.safetensors",
|
||||||
|
"model.layers.22.input_layernorm.weight": "model-00005-of-00014.safetensors",
|
||||||
|
"model.layers.22.mlp.down_proj.weight": "model-00005-of-00014.safetensors",
|
||||||
|
"model.layers.22.mlp.gate_proj.weight": "model-00005-of-00014.safetensors",
|
||||||
|
"model.layers.22.mlp.up_proj.weight": "model-00005-of-00014.safetensors",
|
||||||
|
"model.layers.22.post_attention_layernorm.weight": "model-00005-of-00014.safetensors",
|
||||||
|
"model.layers.22.self_attn.k_norm.weight": "model-00005-of-00014.safetensors",
|
||||||
|
"model.layers.22.self_attn.k_proj.weight": "model-00005-of-00014.safetensors",
|
||||||
|
"model.layers.22.self_attn.o_proj.weight": "model-00005-of-00014.safetensors",
|
||||||
|
"model.layers.22.self_attn.q_norm.weight": "model-00005-of-00014.safetensors",
|
||||||
|
"model.layers.22.self_attn.q_proj.weight": "model-00005-of-00014.safetensors",
|
||||||
|
"model.layers.22.self_attn.v_proj.weight": "model-00005-of-00014.safetensors",
|
||||||
|
"model.layers.23.input_layernorm.weight": "model-00006-of-00014.safetensors",
|
||||||
|
"model.layers.23.mlp.down_proj.weight": "model-00006-of-00014.safetensors",
|
||||||
|
"model.layers.23.mlp.gate_proj.weight": "model-00005-of-00014.safetensors",
|
||||||
|
"model.layers.23.mlp.up_proj.weight": "model-00006-of-00014.safetensors",
|
||||||
|
"model.layers.23.post_attention_layernorm.weight": "model-00006-of-00014.safetensors",
|
||||||
|
"model.layers.23.self_attn.k_norm.weight": "model-00005-of-00014.safetensors",
|
||||||
|
"model.layers.23.self_attn.k_proj.weight": "model-00005-of-00014.safetensors",
|
||||||
|
"model.layers.23.self_attn.o_proj.weight": "model-00005-of-00014.safetensors",
|
||||||
|
"model.layers.23.self_attn.q_norm.weight": "model-00005-of-00014.safetensors",
|
||||||
|
"model.layers.23.self_attn.q_proj.weight": "model-00005-of-00014.safetensors",
|
||||||
|
"model.layers.23.self_attn.v_proj.weight": "model-00005-of-00014.safetensors",
|
||||||
|
"model.layers.24.input_layernorm.weight": "model-00006-of-00014.safetensors",
|
||||||
|
"model.layers.24.mlp.down_proj.weight": "model-00006-of-00014.safetensors",
|
||||||
|
"model.layers.24.mlp.gate_proj.weight": "model-00006-of-00014.safetensors",
|
||||||
|
"model.layers.24.mlp.up_proj.weight": "model-00006-of-00014.safetensors",
|
||||||
|
"model.layers.24.post_attention_layernorm.weight": "model-00006-of-00014.safetensors",
|
||||||
|
"model.layers.24.self_attn.k_norm.weight": "model-00006-of-00014.safetensors",
|
||||||
|
"model.layers.24.self_attn.k_proj.weight": "model-00006-of-00014.safetensors",
|
||||||
|
"model.layers.24.self_attn.o_proj.weight": "model-00006-of-00014.safetensors",
|
||||||
|
"model.layers.24.self_attn.q_norm.weight": "model-00006-of-00014.safetensors",
|
||||||
|
"model.layers.24.self_attn.q_proj.weight": "model-00006-of-00014.safetensors",
|
||||||
|
"model.layers.24.self_attn.v_proj.weight": "model-00006-of-00014.safetensors",
|
||||||
|
"model.layers.25.input_layernorm.weight": "model-00006-of-00014.safetensors",
|
||||||
|
"model.layers.25.mlp.down_proj.weight": "model-00006-of-00014.safetensors",
|
||||||
|
"model.layers.25.mlp.gate_proj.weight": "model-00006-of-00014.safetensors",
|
||||||
|
"model.layers.25.mlp.up_proj.weight": "model-00006-of-00014.safetensors",
|
||||||
|
"model.layers.25.post_attention_layernorm.weight": "model-00006-of-00014.safetensors",
|
||||||
|
"model.layers.25.self_attn.k_norm.weight": "model-00006-of-00014.safetensors",
|
||||||
|
"model.layers.25.self_attn.k_proj.weight": "model-00006-of-00014.safetensors",
|
||||||
|
"model.layers.25.self_attn.o_proj.weight": "model-00006-of-00014.safetensors",
|
||||||
|
"model.layers.25.self_attn.q_norm.weight": "model-00006-of-00014.safetensors",
|
||||||
|
"model.layers.25.self_attn.q_proj.weight": "model-00006-of-00014.safetensors",
|
||||||
|
"model.layers.25.self_attn.v_proj.weight": "model-00006-of-00014.safetensors",
|
||||||
|
"model.layers.26.input_layernorm.weight": "model-00006-of-00014.safetensors",
|
||||||
|
"model.layers.26.mlp.down_proj.weight": "model-00006-of-00014.safetensors",
|
||||||
|
"model.layers.26.mlp.gate_proj.weight": "model-00006-of-00014.safetensors",
|
||||||
|
"model.layers.26.mlp.up_proj.weight": "model-00006-of-00014.safetensors",
|
||||||
|
"model.layers.26.post_attention_layernorm.weight": "model-00006-of-00014.safetensors",
|
||||||
|
"model.layers.26.self_attn.k_norm.weight": "model-00006-of-00014.safetensors",
|
||||||
|
"model.layers.26.self_attn.k_proj.weight": "model-00006-of-00014.safetensors",
|
||||||
|
"model.layers.26.self_attn.o_proj.weight": "model-00006-of-00014.safetensors",
|
||||||
|
"model.layers.26.self_attn.q_norm.weight": "model-00006-of-00014.safetensors",
|
||||||
|
"model.layers.26.self_attn.q_proj.weight": "model-00006-of-00014.safetensors",
|
||||||
|
"model.layers.26.self_attn.v_proj.weight": "model-00006-of-00014.safetensors",
|
||||||
|
"model.layers.27.input_layernorm.weight": "model-00006-of-00014.safetensors",
|
||||||
|
"model.layers.27.mlp.down_proj.weight": "model-00006-of-00014.safetensors",
|
||||||
|
"model.layers.27.mlp.gate_proj.weight": "model-00006-of-00014.safetensors",
|
||||||
|
"model.layers.27.mlp.up_proj.weight": "model-00006-of-00014.safetensors",
|
||||||
|
"model.layers.27.post_attention_layernorm.weight": "model-00006-of-00014.safetensors",
|
||||||
|
"model.layers.27.self_attn.k_norm.weight": "model-00006-of-00014.safetensors",
|
||||||
|
"model.layers.27.self_attn.k_proj.weight": "model-00006-of-00014.safetensors",
|
||||||
|
"model.layers.27.self_attn.o_proj.weight": "model-00006-of-00014.safetensors",
|
||||||
|
"model.layers.27.self_attn.q_norm.weight": "model-00006-of-00014.safetensors",
|
||||||
|
"model.layers.27.self_attn.q_proj.weight": "model-00006-of-00014.safetensors",
|
||||||
|
"model.layers.27.self_attn.v_proj.weight": "model-00006-of-00014.safetensors",
|
||||||
|
"model.layers.28.input_layernorm.weight": "model-00007-of-00014.safetensors",
|
||||||
|
"model.layers.28.mlp.down_proj.weight": "model-00007-of-00014.safetensors",
|
||||||
|
"model.layers.28.mlp.gate_proj.weight": "model-00006-of-00014.safetensors",
|
||||||
|
"model.layers.28.mlp.up_proj.weight": "model-00007-of-00014.safetensors",
|
||||||
|
"model.layers.28.post_attention_layernorm.weight": "model-00007-of-00014.safetensors",
|
||||||
|
"model.layers.28.self_attn.k_norm.weight": "model-00006-of-00014.safetensors",
|
||||||
|
"model.layers.28.self_attn.k_proj.weight": "model-00006-of-00014.safetensors",
|
||||||
|
"model.layers.28.self_attn.o_proj.weight": "model-00006-of-00014.safetensors",
|
||||||
|
"model.layers.28.self_attn.q_norm.weight": "model-00006-of-00014.safetensors",
|
||||||
|
"model.layers.28.self_attn.q_proj.weight": "model-00006-of-00014.safetensors",
|
||||||
|
"model.layers.28.self_attn.v_proj.weight": "model-00006-of-00014.safetensors",
|
||||||
|
"model.layers.29.input_layernorm.weight": "model-00007-of-00014.safetensors",
|
||||||
|
"model.layers.29.mlp.down_proj.weight": "model-00007-of-00014.safetensors",
|
||||||
|
"model.layers.29.mlp.gate_proj.weight": "model-00007-of-00014.safetensors",
|
||||||
|
"model.layers.29.mlp.up_proj.weight": "model-00007-of-00014.safetensors",
|
||||||
|
"model.layers.29.post_attention_layernorm.weight": "model-00007-of-00014.safetensors",
|
||||||
|
"model.layers.29.self_attn.k_norm.weight": "model-00007-of-00014.safetensors",
|
||||||
|
"model.layers.29.self_attn.k_proj.weight": "model-00007-of-00014.safetensors",
|
||||||
|
"model.layers.29.self_attn.o_proj.weight": "model-00007-of-00014.safetensors",
|
||||||
|
"model.layers.29.self_attn.q_norm.weight": "model-00007-of-00014.safetensors",
|
||||||
|
"model.layers.29.self_attn.q_proj.weight": "model-00007-of-00014.safetensors",
|
||||||
|
"model.layers.29.self_attn.v_proj.weight": "model-00007-of-00014.safetensors",
|
||||||
|
"model.layers.3.input_layernorm.weight": "model-00002-of-00014.safetensors",
|
||||||
|
"model.layers.3.mlp.down_proj.weight": "model-00002-of-00014.safetensors",
|
||||||
|
"model.layers.3.mlp.gate_proj.weight": "model-00001-of-00014.safetensors",
|
||||||
|
"model.layers.3.mlp.up_proj.weight": "model-00002-of-00014.safetensors",
|
||||||
|
"model.layers.3.post_attention_layernorm.weight": "model-00002-of-00014.safetensors",
|
||||||
|
"model.layers.3.self_attn.k_norm.weight": "model-00001-of-00014.safetensors",
|
||||||
|
"model.layers.3.self_attn.k_proj.weight": "model-00001-of-00014.safetensors",
|
||||||
|
"model.layers.3.self_attn.o_proj.weight": "model-00001-of-00014.safetensors",
|
||||||
|
"model.layers.3.self_attn.q_norm.weight": "model-00001-of-00014.safetensors",
|
||||||
|
"model.layers.3.self_attn.q_proj.weight": "model-00001-of-00014.safetensors",
|
||||||
|
"model.layers.3.self_attn.v_proj.weight": "model-00001-of-00014.safetensors",
|
||||||
|
"model.layers.30.input_layernorm.weight": "model-00007-of-00014.safetensors",
|
||||||
|
"model.layers.30.mlp.down_proj.weight": "model-00007-of-00014.safetensors",
|
||||||
|
"model.layers.30.mlp.gate_proj.weight": "model-00007-of-00014.safetensors",
|
||||||
|
"model.layers.30.mlp.up_proj.weight": "model-00007-of-00014.safetensors",
|
||||||
|
"model.layers.30.post_attention_layernorm.weight": "model-00007-of-00014.safetensors",
|
||||||
|
"model.layers.30.self_attn.k_norm.weight": "model-00007-of-00014.safetensors",
|
||||||
|
"model.layers.30.self_attn.k_proj.weight": "model-00007-of-00014.safetensors",
|
||||||
|
"model.layers.30.self_attn.o_proj.weight": "model-00007-of-00014.safetensors",
|
||||||
|
"model.layers.30.self_attn.q_norm.weight": "model-00007-of-00014.safetensors",
|
||||||
|
"model.layers.30.self_attn.q_proj.weight": "model-00007-of-00014.safetensors",
|
||||||
|
"model.layers.30.self_attn.v_proj.weight": "model-00007-of-00014.safetensors",
|
||||||
|
"model.layers.31.input_layernorm.weight": "model-00007-of-00014.safetensors",
|
||||||
|
"model.layers.31.mlp.down_proj.weight": "model-00007-of-00014.safetensors",
|
||||||
|
"model.layers.31.mlp.gate_proj.weight": "model-00007-of-00014.safetensors",
|
||||||
|
"model.layers.31.mlp.up_proj.weight": "model-00007-of-00014.safetensors",
|
||||||
|
"model.layers.31.post_attention_layernorm.weight": "model-00007-of-00014.safetensors",
|
||||||
|
"model.layers.31.self_attn.k_norm.weight": "model-00007-of-00014.safetensors",
|
||||||
|
"model.layers.31.self_attn.k_proj.weight": "model-00007-of-00014.safetensors",
|
||||||
|
"model.layers.31.self_attn.o_proj.weight": "model-00007-of-00014.safetensors",
|
||||||
|
"model.layers.31.self_attn.q_norm.weight": "model-00007-of-00014.safetensors",
|
||||||
|
"model.layers.31.self_attn.q_proj.weight": "model-00007-of-00014.safetensors",
|
||||||
|
"model.layers.31.self_attn.v_proj.weight": "model-00007-of-00014.safetensors",
|
||||||
|
"model.layers.32.input_layernorm.weight": "model-00007-of-00014.safetensors",
|
||||||
|
"model.layers.32.mlp.down_proj.weight": "model-00007-of-00014.safetensors",
|
||||||
|
"model.layers.32.mlp.gate_proj.weight": "model-00007-of-00014.safetensors",
|
||||||
|
"model.layers.32.mlp.up_proj.weight": "model-00007-of-00014.safetensors",
|
||||||
|
"model.layers.32.post_attention_layernorm.weight": "model-00007-of-00014.safetensors",
|
||||||
|
"model.layers.32.self_attn.k_norm.weight": "model-00007-of-00014.safetensors",
|
||||||
|
"model.layers.32.self_attn.k_proj.weight": "model-00007-of-00014.safetensors",
|
||||||
|
"model.layers.32.self_attn.o_proj.weight": "model-00007-of-00014.safetensors",
|
||||||
|
"model.layers.32.self_attn.q_norm.weight": "model-00007-of-00014.safetensors",
|
||||||
|
"model.layers.32.self_attn.q_proj.weight": "model-00007-of-00014.safetensors",
|
||||||
|
"model.layers.32.self_attn.v_proj.weight": "model-00007-of-00014.safetensors",
|
||||||
|
"model.layers.33.input_layernorm.weight": "model-00008-of-00014.safetensors",
|
||||||
|
"model.layers.33.mlp.down_proj.weight": "model-00008-of-00014.safetensors",
|
||||||
|
"model.layers.33.mlp.gate_proj.weight": "model-00007-of-00014.safetensors",
|
||||||
|
"model.layers.33.mlp.up_proj.weight": "model-00008-of-00014.safetensors",
|
||||||
|
"model.layers.33.post_attention_layernorm.weight": "model-00008-of-00014.safetensors",
|
||||||
|
"model.layers.33.self_attn.k_norm.weight": "model-00007-of-00014.safetensors",
|
||||||
|
"model.layers.33.self_attn.k_proj.weight": "model-00007-of-00014.safetensors",
|
||||||
|
"model.layers.33.self_attn.o_proj.weight": "model-00007-of-00014.safetensors",
|
||||||
|
"model.layers.33.self_attn.q_norm.weight": "model-00007-of-00014.safetensors",
|
||||||
|
"model.layers.33.self_attn.q_proj.weight": "model-00007-of-00014.safetensors",
|
||||||
|
"model.layers.33.self_attn.v_proj.weight": "model-00007-of-00014.safetensors",
|
||||||
|
"model.layers.34.input_layernorm.weight": "model-00008-of-00014.safetensors",
|
||||||
|
"model.layers.34.mlp.down_proj.weight": "model-00008-of-00014.safetensors",
|
||||||
|
"model.layers.34.mlp.gate_proj.weight": "model-00008-of-00014.safetensors",
|
||||||
|
"model.layers.34.mlp.up_proj.weight": "model-00008-of-00014.safetensors",
|
||||||
|
"model.layers.34.post_attention_layernorm.weight": "model-00008-of-00014.safetensors",
|
||||||
|
"model.layers.34.self_attn.k_norm.weight": "model-00008-of-00014.safetensors",
|
||||||
|
"model.layers.34.self_attn.k_proj.weight": "model-00008-of-00014.safetensors",
|
||||||
|
"model.layers.34.self_attn.o_proj.weight": "model-00008-of-00014.safetensors",
|
||||||
|
"model.layers.34.self_attn.q_norm.weight": "model-00008-of-00014.safetensors",
|
||||||
|
"model.layers.34.self_attn.q_proj.weight": "model-00008-of-00014.safetensors",
|
||||||
|
"model.layers.34.self_attn.v_proj.weight": "model-00008-of-00014.safetensors",
|
||||||
|
"model.layers.35.input_layernorm.weight": "model-00008-of-00014.safetensors",
|
||||||
|
"model.layers.35.mlp.down_proj.weight": "model-00008-of-00014.safetensors",
|
||||||
|
"model.layers.35.mlp.gate_proj.weight": "model-00008-of-00014.safetensors",
|
||||||
|
"model.layers.35.mlp.up_proj.weight": "model-00008-of-00014.safetensors",
|
||||||
|
"model.layers.35.post_attention_layernorm.weight": "model-00008-of-00014.safetensors",
|
||||||
|
"model.layers.35.self_attn.k_norm.weight": "model-00008-of-00014.safetensors",
|
||||||
|
"model.layers.35.self_attn.k_proj.weight": "model-00008-of-00014.safetensors",
|
||||||
|
"model.layers.35.self_attn.o_proj.weight": "model-00008-of-00014.safetensors",
|
||||||
|
"model.layers.35.self_attn.q_norm.weight": "model-00008-of-00014.safetensors",
|
||||||
|
"model.layers.35.self_attn.q_proj.weight": "model-00008-of-00014.safetensors",
|
||||||
|
"model.layers.35.self_attn.v_proj.weight": "model-00008-of-00014.safetensors",
|
||||||
|
"model.layers.36.input_layernorm.weight": "model-00008-of-00014.safetensors",
|
||||||
|
"model.layers.36.mlp.down_proj.weight": "model-00008-of-00014.safetensors",
|
||||||
|
"model.layers.36.mlp.gate_proj.weight": "model-00008-of-00014.safetensors",
|
||||||
|
"model.layers.36.mlp.up_proj.weight": "model-00008-of-00014.safetensors",
|
||||||
|
"model.layers.36.post_attention_layernorm.weight": "model-00008-of-00014.safetensors",
|
||||||
|
"model.layers.36.self_attn.k_norm.weight": "model-00008-of-00014.safetensors",
|
||||||
|
"model.layers.36.self_attn.k_proj.weight": "model-00008-of-00014.safetensors",
|
||||||
|
"model.layers.36.self_attn.o_proj.weight": "model-00008-of-00014.safetensors",
|
||||||
|
"model.layers.36.self_attn.q_norm.weight": "model-00008-of-00014.safetensors",
|
||||||
|
"model.layers.36.self_attn.q_proj.weight": "model-00008-of-00014.safetensors",
|
||||||
|
"model.layers.36.self_attn.v_proj.weight": "model-00008-of-00014.safetensors",
|
||||||
|
"model.layers.37.input_layernorm.weight": "model-00008-of-00014.safetensors",
|
||||||
|
"model.layers.37.mlp.down_proj.weight": "model-00008-of-00014.safetensors",
|
||||||
|
"model.layers.37.mlp.gate_proj.weight": "model-00008-of-00014.safetensors",
|
||||||
|
"model.layers.37.mlp.up_proj.weight": "model-00008-of-00014.safetensors",
|
||||||
|
"model.layers.37.post_attention_layernorm.weight": "model-00008-of-00014.safetensors",
|
||||||
|
"model.layers.37.self_attn.k_norm.weight": "model-00008-of-00014.safetensors",
|
||||||
|
"model.layers.37.self_attn.k_proj.weight": "model-00008-of-00014.safetensors",
|
||||||
|
"model.layers.37.self_attn.o_proj.weight": "model-00008-of-00014.safetensors",
|
||||||
|
"model.layers.37.self_attn.q_norm.weight": "model-00008-of-00014.safetensors",
|
||||||
|
"model.layers.37.self_attn.q_proj.weight": "model-00008-of-00014.safetensors",
|
||||||
|
"model.layers.37.self_attn.v_proj.weight": "model-00008-of-00014.safetensors",
|
||||||
|
"model.layers.38.input_layernorm.weight": "model-00009-of-00014.safetensors",
|
||||||
|
"model.layers.38.mlp.down_proj.weight": "model-00009-of-00014.safetensors",
|
||||||
|
"model.layers.38.mlp.gate_proj.weight": "model-00008-of-00014.safetensors",
|
||||||
|
"model.layers.38.mlp.up_proj.weight": "model-00009-of-00014.safetensors",
|
||||||
|
"model.layers.38.post_attention_layernorm.weight": "model-00009-of-00014.safetensors",
|
||||||
|
"model.layers.38.self_attn.k_norm.weight": "model-00008-of-00014.safetensors",
|
||||||
|
"model.layers.38.self_attn.k_proj.weight": "model-00008-of-00014.safetensors",
|
||||||
|
"model.layers.38.self_attn.o_proj.weight": "model-00008-of-00014.safetensors",
|
||||||
|
"model.layers.38.self_attn.q_norm.weight": "model-00008-of-00014.safetensors",
|
||||||
|
"model.layers.38.self_attn.q_proj.weight": "model-00008-of-00014.safetensors",
|
||||||
|
"model.layers.38.self_attn.v_proj.weight": "model-00008-of-00014.safetensors",
|
||||||
|
"model.layers.39.input_layernorm.weight": "model-00009-of-00014.safetensors",
|
||||||
|
"model.layers.39.mlp.down_proj.weight": "model-00009-of-00014.safetensors",
|
||||||
|
"model.layers.39.mlp.gate_proj.weight": "model-00009-of-00014.safetensors",
|
||||||
|
"model.layers.39.mlp.up_proj.weight": "model-00009-of-00014.safetensors",
|
||||||
|
"model.layers.39.post_attention_layernorm.weight": "model-00009-of-00014.safetensors",
|
||||||
|
"model.layers.39.self_attn.k_norm.weight": "model-00009-of-00014.safetensors",
|
||||||
|
"model.layers.39.self_attn.k_proj.weight": "model-00009-of-00014.safetensors",
|
||||||
|
"model.layers.39.self_attn.o_proj.weight": "model-00009-of-00014.safetensors",
|
||||||
|
"model.layers.39.self_attn.q_norm.weight": "model-00009-of-00014.safetensors",
|
||||||
|
"model.layers.39.self_attn.q_proj.weight": "model-00009-of-00014.safetensors",
|
||||||
|
"model.layers.39.self_attn.v_proj.weight": "model-00009-of-00014.safetensors",
|
||||||
|
"model.layers.4.input_layernorm.weight": "model-00002-of-00014.safetensors",
|
||||||
|
"model.layers.4.mlp.down_proj.weight": "model-00002-of-00014.safetensors",
|
||||||
|
"model.layers.4.mlp.gate_proj.weight": "model-00002-of-00014.safetensors",
|
||||||
|
"model.layers.4.mlp.up_proj.weight": "model-00002-of-00014.safetensors",
|
||||||
|
"model.layers.4.post_attention_layernorm.weight": "model-00002-of-00014.safetensors",
|
||||||
|
"model.layers.4.self_attn.k_norm.weight": "model-00002-of-00014.safetensors",
|
||||||
|
"model.layers.4.self_attn.k_proj.weight": "model-00002-of-00014.safetensors",
|
||||||
|
"model.layers.4.self_attn.o_proj.weight": "model-00002-of-00014.safetensors",
|
||||||
|
"model.layers.4.self_attn.q_norm.weight": "model-00002-of-00014.safetensors",
|
||||||
|
"model.layers.4.self_attn.q_proj.weight": "model-00002-of-00014.safetensors",
|
||||||
|
"model.layers.4.self_attn.v_proj.weight": "model-00002-of-00014.safetensors",
|
||||||
|
"model.layers.40.input_layernorm.weight": "model-00009-of-00014.safetensors",
|
||||||
|
"model.layers.40.mlp.down_proj.weight": "model-00009-of-00014.safetensors",
|
||||||
|
"model.layers.40.mlp.gate_proj.weight": "model-00009-of-00014.safetensors",
|
||||||
|
"model.layers.40.mlp.up_proj.weight": "model-00009-of-00014.safetensors",
|
||||||
|
"model.layers.40.post_attention_layernorm.weight": "model-00009-of-00014.safetensors",
|
||||||
|
"model.layers.40.self_attn.k_norm.weight": "model-00009-of-00014.safetensors",
|
||||||
|
"model.layers.40.self_attn.k_proj.weight": "model-00009-of-00014.safetensors",
|
||||||
|
"model.layers.40.self_attn.o_proj.weight": "model-00009-of-00014.safetensors",
|
||||||
|
"model.layers.40.self_attn.q_norm.weight": "model-00009-of-00014.safetensors",
|
||||||
|
"model.layers.40.self_attn.q_proj.weight": "model-00009-of-00014.safetensors",
|
||||||
|
"model.layers.40.self_attn.v_proj.weight": "model-00009-of-00014.safetensors",
|
||||||
|
"model.layers.41.input_layernorm.weight": "model-00009-of-00014.safetensors",
|
||||||
|
"model.layers.41.mlp.down_proj.weight": "model-00009-of-00014.safetensors",
|
||||||
|
"model.layers.41.mlp.gate_proj.weight": "model-00009-of-00014.safetensors",
|
||||||
|
"model.layers.41.mlp.up_proj.weight": "model-00009-of-00014.safetensors",
|
||||||
|
"model.layers.41.post_attention_layernorm.weight": "model-00009-of-00014.safetensors",
|
||||||
|
"model.layers.41.self_attn.k_norm.weight": "model-00009-of-00014.safetensors",
|
||||||
|
"model.layers.41.self_attn.k_proj.weight": "model-00009-of-00014.safetensors",
|
||||||
|
"model.layers.41.self_attn.o_proj.weight": "model-00009-of-00014.safetensors",
|
||||||
|
"model.layers.41.self_attn.q_norm.weight": "model-00009-of-00014.safetensors",
|
||||||
|
"model.layers.41.self_attn.q_proj.weight": "model-00009-of-00014.safetensors",
|
||||||
|
"model.layers.41.self_attn.v_proj.weight": "model-00009-of-00014.safetensors",
|
||||||
|
"model.layers.42.input_layernorm.weight": "model-00009-of-00014.safetensors",
|
||||||
|
"model.layers.42.mlp.down_proj.weight": "model-00009-of-00014.safetensors",
|
||||||
|
"model.layers.42.mlp.gate_proj.weight": "model-00009-of-00014.safetensors",
|
||||||
|
"model.layers.42.mlp.up_proj.weight": "model-00009-of-00014.safetensors",
|
||||||
|
"model.layers.42.post_attention_layernorm.weight": "model-00009-of-00014.safetensors",
|
||||||
|
"model.layers.42.self_attn.k_norm.weight": "model-00009-of-00014.safetensors",
|
||||||
|
"model.layers.42.self_attn.k_proj.weight": "model-00009-of-00014.safetensors",
|
||||||
|
"model.layers.42.self_attn.o_proj.weight": "model-00009-of-00014.safetensors",
|
||||||
|
"model.layers.42.self_attn.q_norm.weight": "model-00009-of-00014.safetensors",
|
||||||
|
"model.layers.42.self_attn.q_proj.weight": "model-00009-of-00014.safetensors",
|
||||||
|
"model.layers.42.self_attn.v_proj.weight": "model-00009-of-00014.safetensors",
|
||||||
|
"model.layers.43.input_layernorm.weight": "model-00010-of-00014.safetensors",
|
||||||
|
"model.layers.43.mlp.down_proj.weight": "model-00010-of-00014.safetensors",
|
||||||
|
"model.layers.43.mlp.gate_proj.weight": "model-00009-of-00014.safetensors",
|
||||||
|
"model.layers.43.mlp.up_proj.weight": "model-00010-of-00014.safetensors",
|
||||||
|
"model.layers.43.post_attention_layernorm.weight": "model-00010-of-00014.safetensors",
|
||||||
|
"model.layers.43.self_attn.k_norm.weight": "model-00009-of-00014.safetensors",
|
||||||
|
"model.layers.43.self_attn.k_proj.weight": "model-00009-of-00014.safetensors",
|
||||||
|
"model.layers.43.self_attn.o_proj.weight": "model-00009-of-00014.safetensors",
|
||||||
|
"model.layers.43.self_attn.q_norm.weight": "model-00009-of-00014.safetensors",
|
||||||
|
"model.layers.43.self_attn.q_proj.weight": "model-00009-of-00014.safetensors",
|
||||||
|
"model.layers.43.self_attn.v_proj.weight": "model-00009-of-00014.safetensors",
|
||||||
|
"model.layers.44.input_layernorm.weight": "model-00010-of-00014.safetensors",
|
||||||
|
"model.layers.44.mlp.down_proj.weight": "model-00010-of-00014.safetensors",
|
||||||
|
"model.layers.44.mlp.gate_proj.weight": "model-00010-of-00014.safetensors",
|
||||||
|
"model.layers.44.mlp.up_proj.weight": "model-00010-of-00014.safetensors",
|
||||||
|
"model.layers.44.post_attention_layernorm.weight": "model-00010-of-00014.safetensors",
|
||||||
|
"model.layers.44.self_attn.k_norm.weight": "model-00010-of-00014.safetensors",
|
||||||
|
"model.layers.44.self_attn.k_proj.weight": "model-00010-of-00014.safetensors",
|
||||||
|
"model.layers.44.self_attn.o_proj.weight": "model-00010-of-00014.safetensors",
|
||||||
|
"model.layers.44.self_attn.q_norm.weight": "model-00010-of-00014.safetensors",
|
||||||
|
"model.layers.44.self_attn.q_proj.weight": "model-00010-of-00014.safetensors",
|
||||||
|
"model.layers.44.self_attn.v_proj.weight": "model-00010-of-00014.safetensors",
|
||||||
|
"model.layers.45.input_layernorm.weight": "model-00010-of-00014.safetensors",
|
||||||
|
"model.layers.45.mlp.down_proj.weight": "model-00010-of-00014.safetensors",
|
||||||
|
"model.layers.45.mlp.gate_proj.weight": "model-00010-of-00014.safetensors",
|
||||||
|
"model.layers.45.mlp.up_proj.weight": "model-00010-of-00014.safetensors",
|
||||||
|
"model.layers.45.post_attention_layernorm.weight": "model-00010-of-00014.safetensors",
|
||||||
|
"model.layers.45.self_attn.k_norm.weight": "model-00010-of-00014.safetensors",
|
||||||
|
"model.layers.45.self_attn.k_proj.weight": "model-00010-of-00014.safetensors",
|
||||||
|
"model.layers.45.self_attn.o_proj.weight": "model-00010-of-00014.safetensors",
|
||||||
|
"model.layers.45.self_attn.q_norm.weight": "model-00010-of-00014.safetensors",
|
||||||
|
"model.layers.45.self_attn.q_proj.weight": "model-00010-of-00014.safetensors",
|
||||||
|
"model.layers.45.self_attn.v_proj.weight": "model-00010-of-00014.safetensors",
|
||||||
|
"model.layers.46.input_layernorm.weight": "model-00010-of-00014.safetensors",
|
||||||
|
"model.layers.46.mlp.down_proj.weight": "model-00010-of-00014.safetensors",
|
||||||
|
"model.layers.46.mlp.gate_proj.weight": "model-00010-of-00014.safetensors",
|
||||||
|
"model.layers.46.mlp.up_proj.weight": "model-00010-of-00014.safetensors",
|
||||||
|
"model.layers.46.post_attention_layernorm.weight": "model-00010-of-00014.safetensors",
|
||||||
|
"model.layers.46.self_attn.k_norm.weight": "model-00010-of-00014.safetensors",
|
||||||
|
"model.layers.46.self_attn.k_proj.weight": "model-00010-of-00014.safetensors",
|
||||||
|
"model.layers.46.self_attn.o_proj.weight": "model-00010-of-00014.safetensors",
|
||||||
|
"model.layers.46.self_attn.q_norm.weight": "model-00010-of-00014.safetensors",
|
||||||
|
"model.layers.46.self_attn.q_proj.weight": "model-00010-of-00014.safetensors",
|
||||||
|
"model.layers.46.self_attn.v_proj.weight": "model-00010-of-00014.safetensors",
|
||||||
|
"model.layers.47.input_layernorm.weight": "model-00010-of-00014.safetensors",
|
||||||
|
"model.layers.47.mlp.down_proj.weight": "model-00010-of-00014.safetensors",
|
||||||
|
"model.layers.47.mlp.gate_proj.weight": "model-00010-of-00014.safetensors",
|
||||||
|
"model.layers.47.mlp.up_proj.weight": "model-00010-of-00014.safetensors",
|
||||||
|
"model.layers.47.post_attention_layernorm.weight": "model-00010-of-00014.safetensors",
|
||||||
|
"model.layers.47.self_attn.k_norm.weight": "model-00010-of-00014.safetensors",
|
||||||
|
"model.layers.47.self_attn.k_proj.weight": "model-00010-of-00014.safetensors",
|
||||||
|
"model.layers.47.self_attn.o_proj.weight": "model-00010-of-00014.safetensors",
|
||||||
|
"model.layers.47.self_attn.q_norm.weight": "model-00010-of-00014.safetensors",
|
||||||
|
"model.layers.47.self_attn.q_proj.weight": "model-00010-of-00014.safetensors",
|
||||||
|
"model.layers.47.self_attn.v_proj.weight": "model-00010-of-00014.safetensors",
|
||||||
|
"model.layers.48.input_layernorm.weight": "model-00011-of-00014.safetensors",
|
||||||
|
"model.layers.48.mlp.down_proj.weight": "model-00011-of-00014.safetensors",
|
||||||
|
"model.layers.48.mlp.gate_proj.weight": "model-00010-of-00014.safetensors",
|
||||||
|
"model.layers.48.mlp.up_proj.weight": "model-00011-of-00014.safetensors",
|
||||||
|
"model.layers.48.post_attention_layernorm.weight": "model-00011-of-00014.safetensors",
|
||||||
|
"model.layers.48.self_attn.k_norm.weight": "model-00010-of-00014.safetensors",
|
||||||
|
"model.layers.48.self_attn.k_proj.weight": "model-00010-of-00014.safetensors",
|
||||||
|
"model.layers.48.self_attn.o_proj.weight": "model-00010-of-00014.safetensors",
|
||||||
|
"model.layers.48.self_attn.q_norm.weight": "model-00010-of-00014.safetensors",
|
||||||
|
"model.layers.48.self_attn.q_proj.weight": "model-00010-of-00014.safetensors",
|
||||||
|
"model.layers.48.self_attn.v_proj.weight": "model-00010-of-00014.safetensors",
|
||||||
|
"model.layers.49.input_layernorm.weight": "model-00011-of-00014.safetensors",
|
||||||
|
"model.layers.49.mlp.down_proj.weight": "model-00011-of-00014.safetensors",
|
||||||
|
"model.layers.49.mlp.gate_proj.weight": "model-00011-of-00014.safetensors",
|
||||||
|
"model.layers.49.mlp.up_proj.weight": "model-00011-of-00014.safetensors",
|
||||||
|
"model.layers.49.post_attention_layernorm.weight": "model-00011-of-00014.safetensors",
|
||||||
|
"model.layers.49.self_attn.k_norm.weight": "model-00011-of-00014.safetensors",
|
||||||
|
"model.layers.49.self_attn.k_proj.weight": "model-00011-of-00014.safetensors",
|
||||||
|
"model.layers.49.self_attn.o_proj.weight": "model-00011-of-00014.safetensors",
|
||||||
|
"model.layers.49.self_attn.q_norm.weight": "model-00011-of-00014.safetensors",
|
||||||
|
"model.layers.49.self_attn.q_proj.weight": "model-00011-of-00014.safetensors",
|
||||||
|
"model.layers.49.self_attn.v_proj.weight": "model-00011-of-00014.safetensors",
|
||||||
|
"model.layers.5.input_layernorm.weight": "model-00002-of-00014.safetensors",
|
||||||
|
"model.layers.5.mlp.down_proj.weight": "model-00002-of-00014.safetensors",
|
||||||
|
"model.layers.5.mlp.gate_proj.weight": "model-00002-of-00014.safetensors",
|
||||||
|
"model.layers.5.mlp.up_proj.weight": "model-00002-of-00014.safetensors",
|
||||||
|
"model.layers.5.post_attention_layernorm.weight": "model-00002-of-00014.safetensors",
|
||||||
|
"model.layers.5.self_attn.k_norm.weight": "model-00002-of-00014.safetensors",
|
||||||
|
"model.layers.5.self_attn.k_proj.weight": "model-00002-of-00014.safetensors",
|
||||||
|
"model.layers.5.self_attn.o_proj.weight": "model-00002-of-00014.safetensors",
|
||||||
|
"model.layers.5.self_attn.q_norm.weight": "model-00002-of-00014.safetensors",
|
||||||
|
"model.layers.5.self_attn.q_proj.weight": "model-00002-of-00014.safetensors",
|
||||||
|
"model.layers.5.self_attn.v_proj.weight": "model-00002-of-00014.safetensors",
|
||||||
|
"model.layers.50.input_layernorm.weight": "model-00011-of-00014.safetensors",
|
||||||
|
"model.layers.50.mlp.down_proj.weight": "model-00011-of-00014.safetensors",
|
||||||
|
"model.layers.50.mlp.gate_proj.weight": "model-00011-of-00014.safetensors",
|
||||||
|
"model.layers.50.mlp.up_proj.weight": "model-00011-of-00014.safetensors",
|
||||||
|
"model.layers.50.post_attention_layernorm.weight": "model-00011-of-00014.safetensors",
|
||||||
|
"model.layers.50.self_attn.k_norm.weight": "model-00011-of-00014.safetensors",
|
||||||
|
"model.layers.50.self_attn.k_proj.weight": "model-00011-of-00014.safetensors",
|
||||||
|
"model.layers.50.self_attn.o_proj.weight": "model-00011-of-00014.safetensors",
|
||||||
|
"model.layers.50.self_attn.q_norm.weight": "model-00011-of-00014.safetensors",
|
||||||
|
"model.layers.50.self_attn.q_proj.weight": "model-00011-of-00014.safetensors",
|
||||||
|
"model.layers.50.self_attn.v_proj.weight": "model-00011-of-00014.safetensors",
|
||||||
|
"model.layers.51.input_layernorm.weight": "model-00011-of-00014.safetensors",
|
||||||
|
"model.layers.51.mlp.down_proj.weight": "model-00011-of-00014.safetensors",
|
||||||
|
"model.layers.51.mlp.gate_proj.weight": "model-00011-of-00014.safetensors",
|
||||||
|
"model.layers.51.mlp.up_proj.weight": "model-00011-of-00014.safetensors",
|
||||||
|
"model.layers.51.post_attention_layernorm.weight": "model-00011-of-00014.safetensors",
|
||||||
|
"model.layers.51.self_attn.k_norm.weight": "model-00011-of-00014.safetensors",
|
||||||
|
"model.layers.51.self_attn.k_proj.weight": "model-00011-of-00014.safetensors",
|
||||||
|
"model.layers.51.self_attn.o_proj.weight": "model-00011-of-00014.safetensors",
|
||||||
|
"model.layers.51.self_attn.q_norm.weight": "model-00011-of-00014.safetensors",
|
||||||
|
"model.layers.51.self_attn.q_proj.weight": "model-00011-of-00014.safetensors",
|
||||||
|
"model.layers.51.self_attn.v_proj.weight": "model-00011-of-00014.safetensors",
|
||||||
|
"model.layers.52.input_layernorm.weight": "model-00011-of-00014.safetensors",
|
||||||
|
"model.layers.52.mlp.down_proj.weight": "model-00011-of-00014.safetensors",
|
||||||
|
"model.layers.52.mlp.gate_proj.weight": "model-00011-of-00014.safetensors",
|
||||||
|
"model.layers.52.mlp.up_proj.weight": "model-00011-of-00014.safetensors",
|
||||||
|
"model.layers.52.post_attention_layernorm.weight": "model-00011-of-00014.safetensors",
|
||||||
|
"model.layers.52.self_attn.k_norm.weight": "model-00011-of-00014.safetensors",
|
||||||
|
"model.layers.52.self_attn.k_proj.weight": "model-00011-of-00014.safetensors",
|
||||||
|
"model.layers.52.self_attn.o_proj.weight": "model-00011-of-00014.safetensors",
|
||||||
|
"model.layers.52.self_attn.q_norm.weight": "model-00011-of-00014.safetensors",
|
||||||
|
"model.layers.52.self_attn.q_proj.weight": "model-00011-of-00014.safetensors",
|
||||||
|
"model.layers.52.self_attn.v_proj.weight": "model-00011-of-00014.safetensors",
|
||||||
|
"model.layers.53.input_layernorm.weight": "model-00012-of-00014.safetensors",
|
||||||
|
"model.layers.53.mlp.down_proj.weight": "model-00012-of-00014.safetensors",
|
||||||
|
"model.layers.53.mlp.gate_proj.weight": "model-00011-of-00014.safetensors",
|
||||||
|
"model.layers.53.mlp.up_proj.weight": "model-00012-of-00014.safetensors",
|
||||||
|
"model.layers.53.post_attention_layernorm.weight": "model-00012-of-00014.safetensors",
|
||||||
|
"model.layers.53.self_attn.k_norm.weight": "model-00011-of-00014.safetensors",
|
||||||
|
"model.layers.53.self_attn.k_proj.weight": "model-00011-of-00014.safetensors",
|
||||||
|
"model.layers.53.self_attn.o_proj.weight": "model-00011-of-00014.safetensors",
|
||||||
|
"model.layers.53.self_attn.q_norm.weight": "model-00011-of-00014.safetensors",
|
||||||
|
"model.layers.53.self_attn.q_proj.weight": "model-00011-of-00014.safetensors",
|
||||||
|
"model.layers.53.self_attn.v_proj.weight": "model-00011-of-00014.safetensors",
|
||||||
|
"model.layers.54.input_layernorm.weight": "model-00012-of-00014.safetensors",
|
||||||
|
"model.layers.54.mlp.down_proj.weight": "model-00012-of-00014.safetensors",
|
||||||
|
"model.layers.54.mlp.gate_proj.weight": "model-00012-of-00014.safetensors",
|
||||||
|
"model.layers.54.mlp.up_proj.weight": "model-00012-of-00014.safetensors",
|
||||||
|
"model.layers.54.post_attention_layernorm.weight": "model-00012-of-00014.safetensors",
|
||||||
|
"model.layers.54.self_attn.k_norm.weight": "model-00012-of-00014.safetensors",
|
||||||
|
"model.layers.54.self_attn.k_proj.weight": "model-00012-of-00014.safetensors",
|
||||||
|
"model.layers.54.self_attn.o_proj.weight": "model-00012-of-00014.safetensors",
|
||||||
|
"model.layers.54.self_attn.q_norm.weight": "model-00012-of-00014.safetensors",
|
||||||
|
"model.layers.54.self_attn.q_proj.weight": "model-00012-of-00014.safetensors",
|
||||||
|
"model.layers.54.self_attn.v_proj.weight": "model-00012-of-00014.safetensors",
|
||||||
|
"model.layers.55.input_layernorm.weight": "model-00012-of-00014.safetensors",
|
||||||
|
"model.layers.55.mlp.down_proj.weight": "model-00012-of-00014.safetensors",
|
||||||
|
"model.layers.55.mlp.gate_proj.weight": "model-00012-of-00014.safetensors",
|
||||||
|
"model.layers.55.mlp.up_proj.weight": "model-00012-of-00014.safetensors",
|
||||||
|
"model.layers.55.post_attention_layernorm.weight": "model-00012-of-00014.safetensors",
|
||||||
|
"model.layers.55.self_attn.k_norm.weight": "model-00012-of-00014.safetensors",
|
||||||
|
"model.layers.55.self_attn.k_proj.weight": "model-00012-of-00014.safetensors",
|
||||||
|
"model.layers.55.self_attn.o_proj.weight": "model-00012-of-00014.safetensors",
|
||||||
|
"model.layers.55.self_attn.q_norm.weight": "model-00012-of-00014.safetensors",
|
||||||
|
"model.layers.55.self_attn.q_proj.weight": "model-00012-of-00014.safetensors",
|
||||||
|
"model.layers.55.self_attn.v_proj.weight": "model-00012-of-00014.safetensors",
|
||||||
|
"model.layers.56.input_layernorm.weight": "model-00012-of-00014.safetensors",
|
||||||
|
"model.layers.56.mlp.down_proj.weight": "model-00012-of-00014.safetensors",
|
||||||
|
"model.layers.56.mlp.gate_proj.weight": "model-00012-of-00014.safetensors",
|
||||||
|
"model.layers.56.mlp.up_proj.weight": "model-00012-of-00014.safetensors",
|
||||||
|
"model.layers.56.post_attention_layernorm.weight": "model-00012-of-00014.safetensors",
|
||||||
|
"model.layers.56.self_attn.k_norm.weight": "model-00012-of-00014.safetensors",
|
||||||
|
"model.layers.56.self_attn.k_proj.weight": "model-00012-of-00014.safetensors",
|
||||||
|
"model.layers.56.self_attn.o_proj.weight": "model-00012-of-00014.safetensors",
|
||||||
|
"model.layers.56.self_attn.q_norm.weight": "model-00012-of-00014.safetensors",
|
||||||
|
"model.layers.56.self_attn.q_proj.weight": "model-00012-of-00014.safetensors",
|
||||||
|
"model.layers.56.self_attn.v_proj.weight": "model-00012-of-00014.safetensors",
|
||||||
|
"model.layers.57.input_layernorm.weight": "model-00012-of-00014.safetensors",
|
||||||
|
"model.layers.57.mlp.down_proj.weight": "model-00012-of-00014.safetensors",
|
||||||
|
"model.layers.57.mlp.gate_proj.weight": "model-00012-of-00014.safetensors",
|
||||||
|
"model.layers.57.mlp.up_proj.weight": "model-00012-of-00014.safetensors",
|
||||||
|
"model.layers.57.post_attention_layernorm.weight": "model-00012-of-00014.safetensors",
|
||||||
|
"model.layers.57.self_attn.k_norm.weight": "model-00012-of-00014.safetensors",
|
||||||
|
"model.layers.57.self_attn.k_proj.weight": "model-00012-of-00014.safetensors",
|
||||||
|
"model.layers.57.self_attn.o_proj.weight": "model-00012-of-00014.safetensors",
|
||||||
|
"model.layers.57.self_attn.q_norm.weight": "model-00012-of-00014.safetensors",
|
||||||
|
"model.layers.57.self_attn.q_proj.weight": "model-00012-of-00014.safetensors",
|
||||||
|
"model.layers.57.self_attn.v_proj.weight": "model-00012-of-00014.safetensors",
|
||||||
|
"model.layers.58.input_layernorm.weight": "model-00013-of-00014.safetensors",
|
||||||
|
"model.layers.58.mlp.down_proj.weight": "model-00013-of-00014.safetensors",
|
||||||
|
"model.layers.58.mlp.gate_proj.weight": "model-00012-of-00014.safetensors",
|
||||||
|
"model.layers.58.mlp.up_proj.weight": "model-00013-of-00014.safetensors",
|
||||||
|
"model.layers.58.post_attention_layernorm.weight": "model-00013-of-00014.safetensors",
|
||||||
|
"model.layers.58.self_attn.k_norm.weight": "model-00012-of-00014.safetensors",
|
||||||
|
"model.layers.58.self_attn.k_proj.weight": "model-00012-of-00014.safetensors",
|
||||||
|
"model.layers.58.self_attn.o_proj.weight": "model-00012-of-00014.safetensors",
|
||||||
|
"model.layers.58.self_attn.q_norm.weight": "model-00012-of-00014.safetensors",
|
||||||
|
"model.layers.58.self_attn.q_proj.weight": "model-00012-of-00014.safetensors",
|
||||||
|
"model.layers.58.self_attn.v_proj.weight": "model-00012-of-00014.safetensors",
|
||||||
|
"model.layers.59.input_layernorm.weight": "model-00013-of-00014.safetensors",
|
||||||
|
"model.layers.59.mlp.down_proj.weight": "model-00013-of-00014.safetensors",
|
||||||
|
"model.layers.59.mlp.gate_proj.weight": "model-00013-of-00014.safetensors",
|
||||||
|
"model.layers.59.mlp.up_proj.weight": "model-00013-of-00014.safetensors",
|
||||||
|
"model.layers.59.post_attention_layernorm.weight": "model-00013-of-00014.safetensors",
|
||||||
|
"model.layers.59.self_attn.k_norm.weight": "model-00013-of-00014.safetensors",
|
||||||
|
"model.layers.59.self_attn.k_proj.weight": "model-00013-of-00014.safetensors",
|
||||||
|
"model.layers.59.self_attn.o_proj.weight": "model-00013-of-00014.safetensors",
|
||||||
|
"model.layers.59.self_attn.q_norm.weight": "model-00013-of-00014.safetensors",
|
||||||
|
"model.layers.59.self_attn.q_proj.weight": "model-00013-of-00014.safetensors",
|
||||||
|
"model.layers.59.self_attn.v_proj.weight": "model-00013-of-00014.safetensors",
|
||||||
|
"model.layers.6.input_layernorm.weight": "model-00002-of-00014.safetensors",
|
||||||
|
"model.layers.6.mlp.down_proj.weight": "model-00002-of-00014.safetensors",
|
||||||
|
"model.layers.6.mlp.gate_proj.weight": "model-00002-of-00014.safetensors",
|
||||||
|
"model.layers.6.mlp.up_proj.weight": "model-00002-of-00014.safetensors",
|
||||||
|
"model.layers.6.post_attention_layernorm.weight": "model-00002-of-00014.safetensors",
|
||||||
|
"model.layers.6.self_attn.k_norm.weight": "model-00002-of-00014.safetensors",
|
||||||
|
"model.layers.6.self_attn.k_proj.weight": "model-00002-of-00014.safetensors",
|
||||||
|
"model.layers.6.self_attn.o_proj.weight": "model-00002-of-00014.safetensors",
|
||||||
|
"model.layers.6.self_attn.q_norm.weight": "model-00002-of-00014.safetensors",
|
||||||
|
"model.layers.6.self_attn.q_proj.weight": "model-00002-of-00014.safetensors",
|
||||||
|
"model.layers.6.self_attn.v_proj.weight": "model-00002-of-00014.safetensors",
|
||||||
|
"model.layers.60.input_layernorm.weight": "model-00013-of-00014.safetensors",
|
||||||
|
"model.layers.60.mlp.down_proj.weight": "model-00013-of-00014.safetensors",
|
||||||
|
"model.layers.60.mlp.gate_proj.weight": "model-00013-of-00014.safetensors",
|
||||||
|
"model.layers.60.mlp.up_proj.weight": "model-00013-of-00014.safetensors",
|
||||||
|
"model.layers.60.post_attention_layernorm.weight": "model-00013-of-00014.safetensors",
|
||||||
|
"model.layers.60.self_attn.k_norm.weight": "model-00013-of-00014.safetensors",
|
||||||
|
"model.layers.60.self_attn.k_proj.weight": "model-00013-of-00014.safetensors",
|
||||||
|
"model.layers.60.self_attn.o_proj.weight": "model-00013-of-00014.safetensors",
|
||||||
|
"model.layers.60.self_attn.q_norm.weight": "model-00013-of-00014.safetensors",
|
||||||
|
"model.layers.60.self_attn.q_proj.weight": "model-00013-of-00014.safetensors",
|
||||||
|
"model.layers.60.self_attn.v_proj.weight": "model-00013-of-00014.safetensors",
|
||||||
|
"model.layers.61.input_layernorm.weight": "model-00013-of-00014.safetensors",
|
||||||
|
"model.layers.61.mlp.down_proj.weight": "model-00013-of-00014.safetensors",
|
||||||
|
"model.layers.61.mlp.gate_proj.weight": "model-00013-of-00014.safetensors",
|
||||||
|
"model.layers.61.mlp.up_proj.weight": "model-00013-of-00014.safetensors",
|
||||||
|
"model.layers.61.post_attention_layernorm.weight": "model-00013-of-00014.safetensors",
|
||||||
|
"model.layers.61.self_attn.k_norm.weight": "model-00013-of-00014.safetensors",
|
||||||
|
"model.layers.61.self_attn.k_proj.weight": "model-00013-of-00014.safetensors",
|
||||||
|
"model.layers.61.self_attn.o_proj.weight": "model-00013-of-00014.safetensors",
|
||||||
|
"model.layers.61.self_attn.q_norm.weight": "model-00013-of-00014.safetensors",
|
||||||
|
"model.layers.61.self_attn.q_proj.weight": "model-00013-of-00014.safetensors",
|
||||||
|
"model.layers.61.self_attn.v_proj.weight": "model-00013-of-00014.safetensors",
|
||||||
|
"model.layers.62.input_layernorm.weight": "model-00013-of-00014.safetensors",
|
||||||
|
"model.layers.62.mlp.down_proj.weight": "model-00013-of-00014.safetensors",
|
||||||
|
"model.layers.62.mlp.gate_proj.weight": "model-00013-of-00014.safetensors",
|
||||||
|
"model.layers.62.mlp.up_proj.weight": "model-00013-of-00014.safetensors",
|
||||||
|
"model.layers.62.post_attention_layernorm.weight": "model-00013-of-00014.safetensors",
|
||||||
|
"model.layers.62.self_attn.k_norm.weight": "model-00013-of-00014.safetensors",
|
||||||
|
"model.layers.62.self_attn.k_proj.weight": "model-00013-of-00014.safetensors",
|
||||||
|
"model.layers.62.self_attn.o_proj.weight": "model-00013-of-00014.safetensors",
|
||||||
|
"model.layers.62.self_attn.q_norm.weight": "model-00013-of-00014.safetensors",
|
||||||
|
"model.layers.62.self_attn.q_proj.weight": "model-00013-of-00014.safetensors",
|
||||||
|
"model.layers.62.self_attn.v_proj.weight": "model-00013-of-00014.safetensors",
|
||||||
|
"model.layers.63.input_layernorm.weight": "model-00014-of-00014.safetensors",
|
||||||
|
"model.layers.63.mlp.down_proj.weight": "model-00014-of-00014.safetensors",
|
||||||
|
"model.layers.63.mlp.gate_proj.weight": "model-00013-of-00014.safetensors",
|
||||||
|
"model.layers.63.mlp.up_proj.weight": "model-00014-of-00014.safetensors",
|
||||||
|
"model.layers.63.post_attention_layernorm.weight": "model-00014-of-00014.safetensors",
|
||||||
|
"model.layers.63.self_attn.k_norm.weight": "model-00013-of-00014.safetensors",
|
||||||
|
"model.layers.63.self_attn.k_proj.weight": "model-00013-of-00014.safetensors",
|
||||||
|
"model.layers.63.self_attn.o_proj.weight": "model-00013-of-00014.safetensors",
|
||||||
|
"model.layers.63.self_attn.q_norm.weight": "model-00013-of-00014.safetensors",
|
||||||
|
"model.layers.63.self_attn.q_proj.weight": "model-00013-of-00014.safetensors",
|
||||||
|
"model.layers.63.self_attn.v_proj.weight": "model-00013-of-00014.safetensors",
|
||||||
|
"model.layers.7.input_layernorm.weight": "model-00002-of-00014.safetensors",
|
||||||
|
"model.layers.7.mlp.down_proj.weight": "model-00002-of-00014.safetensors",
|
||||||
|
"model.layers.7.mlp.gate_proj.weight": "model-00002-of-00014.safetensors",
|
||||||
|
"model.layers.7.mlp.up_proj.weight": "model-00002-of-00014.safetensors",
|
||||||
|
"model.layers.7.post_attention_layernorm.weight": "model-00002-of-00014.safetensors",
|
||||||
|
"model.layers.7.self_attn.k_norm.weight": "model-00002-of-00014.safetensors",
|
||||||
|
"model.layers.7.self_attn.k_proj.weight": "model-00002-of-00014.safetensors",
|
||||||
|
"model.layers.7.self_attn.o_proj.weight": "model-00002-of-00014.safetensors",
|
||||||
|
"model.layers.7.self_attn.q_norm.weight": "model-00002-of-00014.safetensors",
|
||||||
|
"model.layers.7.self_attn.q_proj.weight": "model-00002-of-00014.safetensors",
|
||||||
|
"model.layers.7.self_attn.v_proj.weight": "model-00002-of-00014.safetensors",
|
||||||
|
"model.layers.8.input_layernorm.weight": "model-00003-of-00014.safetensors",
|
||||||
|
"model.layers.8.mlp.down_proj.weight": "model-00003-of-00014.safetensors",
|
||||||
|
"model.layers.8.mlp.gate_proj.weight": "model-00002-of-00014.safetensors",
|
||||||
|
"model.layers.8.mlp.up_proj.weight": "model-00003-of-00014.safetensors",
|
||||||
|
"model.layers.8.post_attention_layernorm.weight": "model-00003-of-00014.safetensors",
|
||||||
|
"model.layers.8.self_attn.k_norm.weight": "model-00002-of-00014.safetensors",
|
||||||
|
"model.layers.8.self_attn.k_proj.weight": "model-00002-of-00014.safetensors",
|
||||||
|
"model.layers.8.self_attn.o_proj.weight": "model-00002-of-00014.safetensors",
|
||||||
|
"model.layers.8.self_attn.q_norm.weight": "model-00002-of-00014.safetensors",
|
||||||
|
"model.layers.8.self_attn.q_proj.weight": "model-00002-of-00014.safetensors",
|
||||||
|
"model.layers.8.self_attn.v_proj.weight": "model-00002-of-00014.safetensors",
|
||||||
|
"model.layers.9.input_layernorm.weight": "model-00003-of-00014.safetensors",
|
||||||
|
"model.layers.9.mlp.down_proj.weight": "model-00003-of-00014.safetensors",
|
||||||
|
"model.layers.9.mlp.gate_proj.weight": "model-00003-of-00014.safetensors",
|
||||||
|
"model.layers.9.mlp.up_proj.weight": "model-00003-of-00014.safetensors",
|
||||||
|
"model.layers.9.post_attention_layernorm.weight": "model-00003-of-00014.safetensors",
|
||||||
|
"model.layers.9.self_attn.k_norm.weight": "model-00003-of-00014.safetensors",
|
||||||
|
"model.layers.9.self_attn.k_proj.weight": "model-00003-of-00014.safetensors",
|
||||||
|
"model.layers.9.self_attn.o_proj.weight": "model-00003-of-00014.safetensors",
|
||||||
|
"model.layers.9.self_attn.q_norm.weight": "model-00003-of-00014.safetensors",
|
||||||
|
"model.layers.9.self_attn.q_proj.weight": "model-00003-of-00014.safetensors",
|
||||||
|
"model.layers.9.self_attn.v_proj.weight": "model-00003-of-00014.safetensors",
|
||||||
|
"model.norm.weight": "model-00014-of-00014.safetensors"
|
||||||
|
}
|
||||||
|
}
|
||||||
12
run_summary.json
Normal file
12
run_summary.json
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
{
|
||||||
|
"agent_name": "9828cc7d5cb31c19ed7e6dead76bd24dc2d66262_thinking_preprocessed",
|
||||||
|
"training_start": null,
|
||||||
|
"training_end": null,
|
||||||
|
"created_by": "raoof1",
|
||||||
|
"base_model_name": "Qwen/Qwen3-8B",
|
||||||
|
"dataset_name": "/e/scratch/jureap59/raoof1/sft_data/hf_hub/datasets--DCAgent--g1_min_episodes_e1_gpt_long_top8_glm47_traces/snapshots/9828cc7d5cb31c19ed7e6dead76bd24dc2d66262_thinking_preprocessed",
|
||||||
|
"training_type": "SFT",
|
||||||
|
"training_parameters": "https://huggingface.co/DCAgent/g1_gptlong_top8_32b/blob/main/config.json",
|
||||||
|
"wandb_link": null,
|
||||||
|
"traces_location_s3": null
|
||||||
|
}
|
||||||
31
special_tokens_map.json
Normal file
31
special_tokens_map.json
Normal file
@@ -0,0 +1,31 @@
|
|||||||
|
{
|
||||||
|
"additional_special_tokens": [
|
||||||
|
"<|im_start|>",
|
||||||
|
"<|im_end|>",
|
||||||
|
"<|object_ref_start|>",
|
||||||
|
"<|object_ref_end|>",
|
||||||
|
"<|box_start|>",
|
||||||
|
"<|box_end|>",
|
||||||
|
"<|quad_start|>",
|
||||||
|
"<|quad_end|>",
|
||||||
|
"<|vision_start|>",
|
||||||
|
"<|vision_end|>",
|
||||||
|
"<|vision_pad|>",
|
||||||
|
"<|image_pad|>",
|
||||||
|
"<|video_pad|>"
|
||||||
|
],
|
||||||
|
"eos_token": {
|
||||||
|
"content": "<|im_end|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
},
|
||||||
|
"pad_token": {
|
||||||
|
"content": "<|endoftext|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
}
|
||||||
|
}
|
||||||
3
tokenizer.json
Normal file
3
tokenizer.json
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
|
||||||
|
size 11422654
|
||||||
240
tokenizer_config.json
Normal file
240
tokenizer_config.json
Normal file
@@ -0,0 +1,240 @@
|
|||||||
|
{
|
||||||
|
"add_bos_token": false,
|
||||||
|
"add_prefix_space": false,
|
||||||
|
"added_tokens_decoder": {
|
||||||
|
"151643": {
|
||||||
|
"content": "<|endoftext|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151644": {
|
||||||
|
"content": "<|im_start|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151645": {
|
||||||
|
"content": "<|im_end|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151646": {
|
||||||
|
"content": "<|object_ref_start|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151647": {
|
||||||
|
"content": "<|object_ref_end|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151648": {
|
||||||
|
"content": "<|box_start|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151649": {
|
||||||
|
"content": "<|box_end|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151650": {
|
||||||
|
"content": "<|quad_start|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151651": {
|
||||||
|
"content": "<|quad_end|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151652": {
|
||||||
|
"content": "<|vision_start|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151653": {
|
||||||
|
"content": "<|vision_end|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151654": {
|
||||||
|
"content": "<|vision_pad|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151655": {
|
||||||
|
"content": "<|image_pad|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151656": {
|
||||||
|
"content": "<|video_pad|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": true
|
||||||
|
},
|
||||||
|
"151657": {
|
||||||
|
"content": "<tool_call>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": false
|
||||||
|
},
|
||||||
|
"151658": {
|
||||||
|
"content": "</tool_call>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": false
|
||||||
|
},
|
||||||
|
"151659": {
|
||||||
|
"content": "<|fim_prefix|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": false
|
||||||
|
},
|
||||||
|
"151660": {
|
||||||
|
"content": "<|fim_middle|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": false
|
||||||
|
},
|
||||||
|
"151661": {
|
||||||
|
"content": "<|fim_suffix|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": false
|
||||||
|
},
|
||||||
|
"151662": {
|
||||||
|
"content": "<|fim_pad|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": false
|
||||||
|
},
|
||||||
|
"151663": {
|
||||||
|
"content": "<|repo_name|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": false
|
||||||
|
},
|
||||||
|
"151664": {
|
||||||
|
"content": "<|file_sep|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": false
|
||||||
|
},
|
||||||
|
"151665": {
|
||||||
|
"content": "<tool_response>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": false
|
||||||
|
},
|
||||||
|
"151666": {
|
||||||
|
"content": "</tool_response>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": false
|
||||||
|
},
|
||||||
|
"151667": {
|
||||||
|
"content": "<think>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": false
|
||||||
|
},
|
||||||
|
"151668": {
|
||||||
|
"content": "</think>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false,
|
||||||
|
"special": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"additional_special_tokens": [
|
||||||
|
"<|im_start|>",
|
||||||
|
"<|im_end|>",
|
||||||
|
"<|object_ref_start|>",
|
||||||
|
"<|object_ref_end|>",
|
||||||
|
"<|box_start|>",
|
||||||
|
"<|box_end|>",
|
||||||
|
"<|quad_start|>",
|
||||||
|
"<|quad_end|>",
|
||||||
|
"<|vision_start|>",
|
||||||
|
"<|vision_end|>",
|
||||||
|
"<|vision_pad|>",
|
||||||
|
"<|image_pad|>",
|
||||||
|
"<|video_pad|>"
|
||||||
|
],
|
||||||
|
"bos_token": null,
|
||||||
|
"clean_up_tokenization_spaces": false,
|
||||||
|
"eos_token": "<|im_end|>",
|
||||||
|
"errors": "replace",
|
||||||
|
"extra_special_tokens": {},
|
||||||
|
"model_max_length": 32768,
|
||||||
|
"pad_token": "<|endoftext|>",
|
||||||
|
"padding_side": "right",
|
||||||
|
"split_special_tokens": false,
|
||||||
|
"tokenizer_class": "Qwen2Tokenizer",
|
||||||
|
"unk_token": null
|
||||||
|
}
|
||||||
16
train_results.json
Normal file
16
train_results.json
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
{
|
||||||
|
"achieved_tflops_per_gpu": 0.010354545414379324,
|
||||||
|
"achieved_tflops_per_gpu_theoretical": 1519.821207581491,
|
||||||
|
"epoch": 5.0,
|
||||||
|
"loss_nan_ranks": 0,
|
||||||
|
"loss_rank_avg": 0.21731798350811005,
|
||||||
|
"mfu_percent": 0.0007317699939490688,
|
||||||
|
"mfu_percent_theoretical": 107.40785919303823,
|
||||||
|
"total_flos": 9850857283649536.0,
|
||||||
|
"train_loss": 0.03974963869139502,
|
||||||
|
"train_runtime": 9909.9567,
|
||||||
|
"train_samples_per_second": 14.475,
|
||||||
|
"train_steps_per_second": 0.151,
|
||||||
|
"valid_targets_mean": 1465.8,
|
||||||
|
"valid_targets_min": 325
|
||||||
|
}
|
||||||
3336
trainer_state.json
Normal file
3336
trainer_state.json
Normal file
File diff suppressed because it is too large
Load Diff
366
training_configs/sft.sbatch
Normal file
366
training_configs/sft.sbatch
Normal file
@@ -0,0 +1,366 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
#SBATCH --time=12:00:00
|
||||||
|
#SBATCH --nodes=24
|
||||||
|
#SBATCH --ntasks-per-node=1
|
||||||
|
#SBATCH --cpus-per-task=288
|
||||||
|
#SBATCH --output=/e/scratch/jureap59/raoof1/sft_experiments/g1_gptlong_top8_32b/logs/%x_%j.out
|
||||||
|
#SBATCH --job-name=g1_gptlong_top8_32b__Qwen3-8B
|
||||||
|
#SBATCH --mail-type=END,TIME_LIMIT,FAIL
|
||||||
|
#SBATCH --mail-user=
|
||||||
|
#SBATCH -p booster
|
||||||
|
#SBATCH --account reformo
|
||||||
|
#SBATCH --gres=gpu:4
|
||||||
|
#SBATCH --exclude=jpbo-031-[01-48]
|
||||||
|
|
||||||
|
# ==============================================================================
|
||||||
|
# Universal SFT Training SBATCH Template
|
||||||
|
# ==============================================================================
|
||||||
|
# This template replaces the cluster-specific *_train.sbatch scripts by delegating
|
||||||
|
# all logic to the SFTJobRunner Python class.
|
||||||
|
#
|
||||||
|
# Usage: The launcher writes a JSON config file and substitutes its path into this script (for this run: /e/scratch/jureap59/raoof1/sft_experiments/g1_gptlong_top8_32b/configs/g1_gptlong_top8_32b__Qwen3-8B_sft_config.json).
|
||||||
|
# ==============================================================================
|
||||||
|
|
||||||
|
source /e/scratch/jureap59/feuer1/miniforge3/etc/profile.d/conda.sh
|
||||||
|
conda activate otagent
|
||||||
|
set -eo pipefail
|
||||||
|
ml purge
|
||||||
|
ulimit -c 0 # Disable core dumps to avoid filling disk space
|
||||||
|
|
||||||
|
# Handle bash completion scripts that use BASH_COMPLETION_DEBUG
|
||||||
|
if [ -z "${BASH_COMPLETION_DEBUG+x}" ]; then
|
||||||
|
export BASH_COMPLETION_DEBUG=""
|
||||||
|
fi
|
||||||
|
|
||||||
|
# --- Clean up /tmp to prevent state pollution from previous jobs ---
|
||||||
|
# Some HPC systems retain /tmp contents across job allocations on the same node.
|
||||||
|
# This can cause issues with tmux sessions, container state, and other temporary files.
|
||||||
|
rm -rf /tmp/tmux-* 2>/dev/null || true
|
||||||
|
rm -rf /tmp/ray 2>/dev/null || true
|
||||||
|
rm -rf /tmp/hf_home 2>/dev/null || true
|
||||||
|
rm -rf /tmp/containers 2>/dev/null || true
|
||||||
|
rm -rf /tmp/podman-* 2>/dev/null || true
|
||||||
|
|
||||||
|
# Guard conda deactivate scripts from set -u complaints
|
||||||
|
export CONDA_BACKUP_CXX="${CONDA_BACKUP_CXX:-}"
|
||||||
|
export CONDA_BACKUP_CC="${CONDA_BACKUP_CC:-}"
|
||||||
|
export CONDA_BACKUP_FC="${CONDA_BACKUP_FC:-}"
|
||||||
|
|
||||||
|
# --- Module and Conda Setup ---
|
||||||
|
|
||||||
|
# --- Module loading (cluster-specific, substituted by launcher) ---
|
||||||
|
module load nvidia-compilers/25.9-CUDA-13
|
||||||
|
|
||||||
|
# --- Environment setup ---
|
||||||
|
if [ -n "${DCFT_PRIVATE:-}" ]; then
|
||||||
|
WORKDIR="$DCFT_PRIVATE"
|
||||||
|
elif [ -n "${DCFT:-}" ]; then
|
||||||
|
WORKDIR="$DCFT"
|
||||||
|
else
|
||||||
|
WORKDIR="$PWD"
|
||||||
|
fi
|
||||||
|
cd "$WORKDIR"
|
||||||
|
|
||||||
|
if [ -z "${DCFT:-}" ]; then
|
||||||
|
export DCFT="$WORKDIR"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# --- Conda activation (cluster-specific, substituted by launcher) ---
|
||||||
|
# No conda activation configured
|
||||||
|
|
||||||
|
# --- Source environment files ---
|
||||||
|
if [ -n "${DCFT:-}" ] && [ -f "$DCFT/hpc/dotenv/jupiter.env" ]; then
|
||||||
|
source "$DCFT/hpc/dotenv/jupiter.env"
|
||||||
|
fi
|
||||||
|
if [ -n "${DC_AGENT_SECRET_ENV:-}" ] && [ -f "$DC_AGENT_SECRET_ENV" ]; then
|
||||||
|
set -a
|
||||||
|
source "$DC_AGENT_SECRET_ENV"
|
||||||
|
set +a
|
||||||
|
fi
|
||||||
|
if [ -n "${DCFT_ACTIVATE_ENV:-}" ]; then
|
||||||
|
eval "$DCFT_ACTIVATE_ENV"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# --- CUDA path detection (Perlmutter and similar) ---
|
||||||
|
|
||||||
|
|
||||||
|
# --- NCCL/Networking settings (cluster-specific) ---
|
||||||
|
# Cluster-specific NCCL/networking settings
|
||||||
|
export NCCL_DEBUG="INFO"
|
||||||
|
export NCCL_NET_GDR_LEVEL="0"
|
||||||
|
export NCCL_SOCKET_IFNAME="ib0"
|
||||||
|
export NCCL_IB_TIMEOUT="60"
|
||||||
|
|
||||||
|
# --- Cluster-specific environment variables ---
|
||||||
|
export WANDB_MODE="offline"
|
||||||
|
export GLOO_USE_IPV6="0"
|
||||||
|
export NCCL_SOCKET_FAMILY="AF_INET"
|
||||||
|
export SKYRL_ENABLE_NUMA_AFFINITY="1"
|
||||||
|
export DISABLE_AIOHTTP_TRANSPORT="True"
|
||||||
|
export VLLM_ALLREDUCE_USE_SYMM_MEM="0"
|
||||||
|
|
||||||
|
# --- Ray defaults ---
|
||||||
|
# Keep any RAY_CGRAPH_get_timeout already set in the environment; otherwise default it to 900.
|
||||||
|
export RAY_CGRAPH_get_timeout="${RAY_CGRAPH_get_timeout:-900}"
|
||||||
|
# GH200 unified memory: GPU HBM is part of system RAM, so Ray's
|
||||||
|
# memory monitor double-counts GPU allocations and kills workers
|
||||||
|
# during model loading. Disable the monitor entirely.
|
||||||
|
export RAY_memory_monitor_refresh_ms=0
|
||||||
|
if [ -z "${RAY_TMPDIR:-}" ]; then
|
||||||
|
RAY_TMPDIR_BASE="/tmp/ray"
|
||||||
|
RAY_TMPDIR="${RAY_TMPDIR_BASE}/ray_${SLURM_JOB_ID:-$$}"
|
||||||
|
mkdir -p "$RAY_TMPDIR"
|
||||||
|
fi
|
||||||
|
export RAY_TMPDIR="${RAY_TMPDIR}"
|
||||||
|
echo "[ray] RAY_TMPDIR=$RAY_TMPDIR"
|
||||||
|
|
||||||
|
# --- Standard environment variables ---
|
||||||
|
export PYTHONFAULTHANDLER=1
|
||||||
|
export TORCH_SHOW_CPP_STACKTRACES=1
|
||||||
|
export CUDA_LAUNCH_BLOCKING=0
|
||||||
|
export TORCH_NCCL_ASYNC_ERROR_HANDLING=1
|
||||||
|
export OMP_NUM_THREADS=1
|
||||||
|
export PYTHONPATH="$WORKDIR:${PYTHONPATH:-}"
|
||||||
|
export DISABLE_VERSION_CHECK=1 # Skip LlamaFactory transformers version check
|
||||||
|
|
||||||
|
# --- Distributed training setup ---
|
||||||
|
export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
|
||||||
|
# If the hostname contains jrc or jwb, set the master address to ${MASTER_ADDR}i to use InfiniBand
|
||||||
|
if [[ "$MASTER_ADDR" == *"jrc"* || "$MASTER_ADDR" == *"jwb"* ]]; then
|
||||||
|
export MASTER_ADDR="${MASTER_ADDR}i"
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "MASTER_ADDR set to $MASTER_ADDR"
|
||||||
|
export MASTER_PORT=12802
|
||||||
|
export NUM_NODES=$SLURM_JOB_NUM_NODES
|
||||||
|
export NUM_GPUS_PER_NODE=4
|
||||||
|
export NUM_GPUS=$((NUM_GPUS_PER_NODE*SLURM_NNODES))
|
||||||
|
|
||||||
|
# --- HuggingFace/WandB paths ---
|
||||||
|
export HF_HOME="${HF_HOME:-${HF_HUB_CACHE:-/tmp/hf_home}}"
|
||||||
|
export WANDB_DIR="${DCFT_WANDB_DIR:-$DCFT//e/scratch/jureap59/raoof1/sft_experiments/g1_gptlong_top8_32b/wandb}"
|
||||||
|
|
||||||
|
# --- Triton/TorchInductor cache settings (node-local to avoid shared FS issues) ---
|
||||||
|
export TRITON_CACHE_VERBOSE=1
|
||||||
|
source "$WORKDIR/hpc/shell_utils/triton_cache.sh"
|
||||||
|
|
||||||
|
# --- Create experiment directories ---
|
||||||
|
mkdir -p "$DCFT//e/scratch/jureap59/raoof1/sft_experiments/g1_gptlong_top8_32b"
|
||||||
|
mkdir -p "$DCFT//e/scratch/jureap59/raoof1/sft_experiments/g1_gptlong_top8_32b/logs"
|
||||||
|
mkdir -p "$DCFT//e/scratch/jureap59/raoof1/sft_experiments/g1_gptlong_top8_32b/tmp"
|
||||||
|
|
||||||
|
# --- Supabase environment variables for DB registration ---
|
||||||
|
for _supabase_var in SUPABASE_URL SUPABASE_ANON_KEY SUPABASE_SERVICE_ROLE_KEY; do
|
||||||
|
if [[ -n "${!_supabase_var:-}" ]]; then
|
||||||
|
export "${_supabase_var}=${!_supabase_var}"
|
||||||
|
else
|
||||||
|
echo "Warning: ${_supabase_var} is not set; Supabase registration may fail." >&2
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
# --- SSH Tunneling (JSC and Leonardo clusters) ---
|
||||||
|
# ============================================================================
|
||||||
|
# SSH Tunnel + Proxychains Setup for No-Internet Clusters (JSC, Leonardo)
|
||||||
|
#
|
||||||
|
# Creates SOCKS5 proxy via SSH tunnel to login node, then uses proxychains
|
||||||
|
# to route external traffic through the tunnel.
|
||||||
|
#
|
||||||
|
# Jupiter (ARM GH200) and Leonardo (x86): Use wrapped binary approach (proxychains4 -f <config> cmd)
|
||||||
|
# Other JSC clusters: Uses LD_PRELOAD approach for Ray worker inheritance
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
# Determine login node and proxychains paths based on cluster
|
||||||
|
NODE_HOST=$(hostname -s)
|
||||||
|
PROXYCHAINS_MODE="" # "binary" or "ldpreload"
|
||||||
|
|
||||||
|
if [[ $NODE_HOST == jrc* ]]; then
|
||||||
|
LOGIN_NODE="jrlogin05i"
|
||||||
|
PROXYCHAINS_LIB="/p/scratch/synthlaion/dc-agent-shared/tools/proxychains-ng-install/lib/libproxychains4.so"
|
||||||
|
PROXYCHAINS_MODE="ldpreload"
|
||||||
|
elif [[ $NODE_HOST == jwb* ]]; then
|
||||||
|
LOGIN_NODE="jwlogin22i"
|
||||||
|
PROXYCHAINS_LIB="/p/scratch/synthlaion/dc-agent-shared/tools/proxychains-ng-install/lib/libproxychains4.so"
|
||||||
|
PROXYCHAINS_MODE="ldpreload"
|
||||||
|
elif [[ $NODE_HOST == jpb* ]] || [[ $NODE_HOST == jpc* ]]; then
|
||||||
|
LOGIN_NODE="jpbl-s01-01"
|
||||||
|
# Jupiter uses aarch64 build - binary wrapper approach (LD_PRELOAD doesn't work reliably)
|
||||||
|
PROXYCHAINS_BIN="/e/scratch/jureap59/feuer1/proxychains-ng-aarch64/bin/proxychains4"
|
||||||
|
PROXYCHAINS_MODE="binary"
|
||||||
|
elif [[ $NODE_HOST == lrdn* ]] || [[ $NODE_HOST == *.leonardo.local ]]; then
|
||||||
|
LOGIN_NODE="login05-ext.leonardo.cineca.it"
|
||||||
|
# Leonardo uses x86 build - binary wrapper approach
|
||||||
|
PROXYCHAINS_BIN="/leonardo/home/userexternal/bfeuer00/proxychains/bin/proxychains4"
|
||||||
|
PROXYCHAINS_MODE="binary"
|
||||||
|
else
|
||||||
|
echo "[proxy] Unknown cluster for node $NODE_HOST - skipping proxy setup"
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
TUNNEL_PORT=7003
|
||||||
|
|
||||||
|
# Check if proxychains is available
|
||||||
|
if [[ "$PROXYCHAINS_MODE" == "binary" ]]; then
|
||||||
|
if [ ! -x "$PROXYCHAINS_BIN" ]; then
|
||||||
|
echo "[proxy] ✗ proxychains binary not found at $PROXYCHAINS_BIN"
|
||||||
|
echo "[proxy] Skipping proxy setup - external connectivity will fail"
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
echo "[proxy] ✓ Found proxychains binary at $PROXYCHAINS_BIN"
|
||||||
|
else
|
||||||
|
if [ ! -f "$PROXYCHAINS_LIB" ]; then
|
||||||
|
echo "[proxy] ✗ proxychains library not found at $PROXYCHAINS_LIB"
|
||||||
|
echo "[proxy] Skipping proxy setup - external connectivity will fail"
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
echo "[proxy] ✓ Found proxychains library at $PROXYCHAINS_LIB"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ -z "${SSH_KEY:-}" ]; then
|
||||||
|
echo "[proxy] SSH_KEY not set - skipping proxy setup"
|
||||||
|
echo "[proxy] Set SSH_KEY in your environment to enable internet access"
|
||||||
|
else
|
||||||
|
# Get this node's IP address for multi-node proxy access
|
||||||
|
NODE_IP=$(nslookup $NODE_HOST | grep 'Address' | tail -n1 | awk '{print $2}')
|
||||||
|
echo "[proxy] Setting up SSH tunnel to $LOGIN_NODE"
|
||||||
|
echo "[proxy] SSH key: $SSH_KEY"
|
||||||
|
echo "[proxy] Tunnel port: $TUNNEL_PORT"
|
||||||
|
echo "[proxy] Node IP: $NODE_IP (workers will connect here)"
|
||||||
|
|
||||||
|
# Create SSH tunnel with SOCKS5 proxy
|
||||||
|
# -g flag allows remote hosts (worker nodes) to connect to the tunnel
|
||||||
|
ssh -g -f -N -D ${TUNNEL_PORT} \
|
||||||
|
-o StrictHostKeyChecking=no \
|
||||||
|
-o ConnectTimeout=1000 \
|
||||||
|
-o ServerAliveInterval=10 \
|
||||||
|
-o ServerAliveCountMax=30 \
|
||||||
|
-o TCPKeepAlive=yes \
|
||||||
|
-o ExitOnForwardFailure=yes \
|
||||||
|
-o BatchMode=yes \
|
||||||
|
-i ${SSH_KEY} \
|
||||||
|
${USER}@${LOGIN_NODE}
|
||||||
|
|
||||||
|
# Give tunnel time to establish
|
||||||
|
sleep 5
|
||||||
|
|
||||||
|
# Verify tunnel is running
|
||||||
|
if pgrep -f "ssh.*-D.*${TUNNEL_PORT}" > /dev/null; then
|
||||||
|
echo "[proxy] ✓ SSH tunnel started successfully"
|
||||||
|
else
|
||||||
|
echo "[proxy] ✗ SSH tunnel failed to start"
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Generate proxychains config
|
||||||
|
# Key: Uses NODE_IP (not localhost) so worker nodes can access the tunnel
|
||||||
|
# localnet entries ensure internal traffic (Ray, NCCL) bypasses proxy
|
||||||
|
# ============================================================================
|
||||||
|
SLURM_JOB_ID=${SLURM_JOB_ID:-"local"}
|
||||||
|
CFG_PATH=~/.proxychains/proxychains_${SLURM_JOB_ID}.conf
|
||||||
|
mkdir -p ~/.proxychains
|
||||||
|
|
||||||
|
cat > "$CFG_PATH" <<PCEOF
|
||||||
|
strict_chain
|
||||||
|
quiet_mode
|
||||||
|
tcp_read_time_out 30000
|
||||||
|
tcp_connect_time_out 15000
|
||||||
|
localnet 127.0.0.0/255.0.0.0
|
||||||
|
localnet 127.0.0.1/255.255.255.255
|
||||||
|
localnet 10.0.0.0/255.0.0.0
|
||||||
|
localnet 172.16.0.0/255.240.0.0
|
||||||
|
localnet 192.168.0.0/255.255.0.0
|
||||||
|
localnet 169.254.0.0/255.255.0.0
|
||||||
|
[ProxyList]
|
||||||
|
socks5 ${NODE_IP} ${TUNNEL_PORT}
|
||||||
|
PCEOF
|
||||||
|
|
||||||
|
echo "[proxy] ✓ Generated proxychains config at $CFG_PATH"
|
||||||
|
echo "[proxy] - Internal traffic (10.x.x.x, 172.x.x.x, 169.254.x.x) → DIRECT"
|
||||||
|
echo "[proxy] - External traffic (internet) → PROXY via tunnel"
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Export proxychains configuration based on mode
|
||||||
|
# ============================================================================
|
||||||
|
export PROXYCHAINS_CONF_FILE="$CFG_PATH"
|
||||||
|
export PROXYCHAINS_SOCKS5_HOST="${NODE_IP}"
|
||||||
|
export PROXYCHAINS_SOCKS5_PORT="${TUNNEL_PORT}"
|
||||||
|
|
||||||
|
# if [[ "$PROXYCHAINS_MODE" == "binary" ]]; then
|
||||||
|
# # Binary wrapper approach (Jupiter ARM GH200)
|
||||||
|
# # Ray workers will use: proxychains4 -f $PROXYCHAINS_CONF_FILE ray start ...
|
||||||
|
# export PROXYCHAINS_BINARY="$PROXYCHAINS_BIN"
|
||||||
|
# echo "[proxy] ✓ PROXYCHAINS_BINARY=$PROXYCHAINS_BIN"
|
||||||
|
# echo "[proxy] ✓ PROXYCHAINS_CONF_FILE=$CFG_PATH"
|
||||||
|
# echo "[proxy] ✓ PROXYCHAINS_SOCKS5_HOST=${NODE_IP} (accessible from worker nodes)"
|
||||||
|
# echo "[proxy] ✓ PROXYCHAINS_SOCKS5_PORT=${TUNNEL_PORT}"
|
||||||
|
# else
|
||||||
|
# # LD_PRELOAD approach (Jureca, Juwels)
|
||||||
|
# # Ray workers inherit proxy via LD_PRELOAD environment variable
|
||||||
|
# export LD_PRELOAD="$PROXYCHAINS_LIB"
|
||||||
|
# echo "[proxy] ✓ LD_PRELOAD set to $PROXYCHAINS_LIB"
|
||||||
|
# echo "[proxy] ✓ PROXYCHAINS_CONF_FILE=$CFG_PATH"
|
||||||
|
# echo "[proxy] ✓ PROXYCHAINS_SOCKS5_HOST=${NODE_IP} (accessible from worker nodes)"
|
||||||
|
# echo "[proxy] ✓ PROXYCHAINS_SOCKS5_PORT=${TUNNEL_PORT}"
|
||||||
|
# fi
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Daytona/aiohttp timeout and retry settings
|
||||||
|
# ============================================================================
|
||||||
|
export DAYTONA_MAX_RETRIES=5
|
||||||
|
export DAYTONA_RETRY_DELAY=30
|
||||||
|
export DAYTONA_BACKOFF_FACTOR=2
|
||||||
|
export DAYTONA_TIMEOUT=1800 # 30 minutes
|
||||||
|
export AIOHTTP_CLIENT_TIMEOUT=900 # 15 minutes
|
||||||
|
export AIOHTTP_CONNECTOR_TIMEOUT=900
|
||||||
|
export AIOHTTP_SOCK_CONNECT_TIMEOUT=300
|
||||||
|
export AIOHTTP_TOTAL_TIMEOUT=1800
|
||||||
|
|
||||||
|
# Disable SSL verification (JSC certificate issues)
|
||||||
|
export PYTHONHTTPSVERIFY=0
|
||||||
|
unset SSL_CERT_FILE
|
||||||
|
unset CURL_CA_BUNDLE
|
||||||
|
unset REQUESTS_CA_BUNDLE
|
||||||
|
unset SSL_CERT_DIR
|
||||||
|
|
||||||
|
echo "[proxy] ✓ Daytona timeout settings configured"
|
||||||
|
|
||||||
|
# Test proxy connectivity
|
||||||
|
echo "[proxy] Testing proxy connectivity..."
|
||||||
|
if [[ "$PROXYCHAINS_MODE" == "binary" ]]; then
|
||||||
|
if "$PROXYCHAINS_BIN" -f "$CFG_PATH" curl -s --connect-timeout 10 https://huggingface.co -o /dev/null; then
|
||||||
|
echo "[proxy] ✓ Proxy connectivity test passed (huggingface.co reachable via wrapped binary)"
|
||||||
|
else
|
||||||
|
echo "[proxy] ⚠ Proxy connectivity test failed (may still work for Daytona)"
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
if curl -s --connect-timeout 10 https://huggingface.co -o /dev/null 2>/dev/null; then
|
||||||
|
echo "[proxy] ✓ Proxy connectivity test passed (huggingface.co reachable via LD_PRELOAD)"
|
||||||
|
else
|
||||||
|
echo "[proxy] ⚠ Proxy connectivity test failed (may still work for Daytona)"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Test that tunnel is accessible from this node's IP (for worker node access)
|
||||||
|
if nc -z ${NODE_IP} ${TUNNEL_PORT} 2>/dev/null; then
|
||||||
|
echo "[proxy] ✓ Tunnel accessible at ${NODE_IP}:${TUNNEL_PORT} (workers can connect)"
|
||||||
|
else
|
||||||
|
echo "[proxy] ⚠ Tunnel not accessible at ${NODE_IP}:${TUNNEL_PORT} (workers may fail)"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ "$PROXYCHAINS_MODE" == "binary" ]]; then
|
||||||
|
echo "[proxy] ✓ Proxy setup complete (using wrapped binary for Ray workers)"
|
||||||
|
else
|
||||||
|
echo "[proxy] ✓ Proxy setup complete (using LD_PRELOAD for Ray worker inheritance)"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
|
||||||
|
# --- Run the SFT job via Python runner ---
|
||||||
|
echo "=== Universal SFT Training Runner ==="
|
||||||
|
echo "Config: /e/scratch/jureap59/raoof1/sft_experiments/g1_gptlong_top8_32b/configs/g1_gptlong_top8_32b__Qwen3-8B_sft_config.json"
|
||||||
|
echo "Working directory: $WORKDIR"
|
||||||
|
echo "Nodes: $NUM_NODES, GPUs/node: $NUM_GPUS_PER_NODE"
|
||||||
|
echo "======================================"
|
||||||
|
|
||||||
|
echo LD_LIBRARY_PATH=$LD_LIBRARY_PATH
|
||||||
|
srun --mpi=none --nodes=24 $PROXY_CMD bash -c 'python -m hpc.sft_launch_utils --config "/e/scratch/jureap59/raoof1/sft_experiments/g1_gptlong_top8_32b/configs/g1_gptlong_top8_32b__Qwen3-8B_sft_config.json"'
|
||||||
15
training_configs/sft_config.json
Normal file
15
training_configs/sft_config.json
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
{
|
||||||
|
"job_name": "g1_gptlong_top8_32b__Qwen3-8B",
|
||||||
|
"train_config_path": "/e/scratch/jureap59/raoof1/sft_experiments/g1_gptlong_top8_32b/configs/g1_gptlong_top8_32b__Qwen3-8B_train_config.yaml",
|
||||||
|
"experiments_dir": "/e/scratch/jureap59/raoof1/sft_experiments/g1_gptlong_top8_32b",
|
||||||
|
"cluster_name": "jupiter",
|
||||||
|
"num_nodes": 24,
|
||||||
|
"gpus_per_node": 4,
|
||||||
|
"cpus_per_node": 288,
|
||||||
|
"launcher": "accelerate",
|
||||||
|
"accelerate_config_path": null,
|
||||||
|
"deepspeed_config": "sft/lf_configs/deepspeed/ds_z3_accelerate.json",
|
||||||
|
"master_port": 12802,
|
||||||
|
"needs_ssh_tunnel": true,
|
||||||
|
"needs_cuda_detection": false
|
||||||
|
}
|
||||||
52
training_configs/train_config.yaml
Normal file
52
training_configs/train_config.yaml
Normal file
@@ -0,0 +1,52 @@
|
|||||||
|
adam_beta2: 0.98
|
||||||
|
assistant_tag: assistant
|
||||||
|
attn: fa2
|
||||||
|
bf16: true
|
||||||
|
content_tag: content
|
||||||
|
cutoff_len: 32768
|
||||||
|
dataloader_num_workers: 4
|
||||||
|
dataloader_persistent_workers: true
|
||||||
|
dataloader_pin_memory: true
|
||||||
|
dataset: /e/scratch/jureap59/raoof1/sft_data/hf_hub/datasets--DCAgent--g1_min_episodes_e1_gpt_long_top8_glm47_traces/snapshots/9828cc7d5cb31c19ed7e6dead76bd24dc2d66262_thinking_preprocessed
|
||||||
|
dataset_dir: ONLINE
|
||||||
|
datasets_cache_dir: /e/scratch/jureap59/raoof1/sft_data/arrow_cache
|
||||||
|
ddp_timeout: 180000000
|
||||||
|
deepspeed: sft/lf_configs/deepspeed/ds_z3_accelerate.json
|
||||||
|
do_train: true
|
||||||
|
enable_liger_kernel: true
|
||||||
|
finetuning_type: full
|
||||||
|
formatting: sharegpt
|
||||||
|
gradient_accumulation_steps: 1
|
||||||
|
gradient_checkpointing: true
|
||||||
|
hub_model_id: DCAgent/g1_gptlong_top8_32b
|
||||||
|
include_mfu: true
|
||||||
|
learning_rate: 4.0e-05
|
||||||
|
load_best_model_at_end: false
|
||||||
|
logging_steps: 5
|
||||||
|
logging_strategy: steps
|
||||||
|
lr_scheduler_type: cosine
|
||||||
|
max_grad_norm: 0.001
|
||||||
|
messages: conversations
|
||||||
|
model_name_or_path: /e/scratch/jureap59/raoof1/sft_data/hf_hub/models--Qwen--Qwen3-32B/snapshots/9216db5781bf21249d130ec9da846c4624c16137
|
||||||
|
num_train_epochs: 5.0
|
||||||
|
optim: adamw_torch_fused
|
||||||
|
output_dir: /e/scratch/jureap59/raoof1/sft_data/checkpoints/sft_g1_gptlong_top8_32b__Qwen3-32B
|
||||||
|
overwrite_cache: true
|
||||||
|
per_device_train_batch_size: 1
|
||||||
|
plot_loss: true
|
||||||
|
preprocessing_num_workers: 16
|
||||||
|
pure_bf16: false
|
||||||
|
push_to_hub: false
|
||||||
|
role_tag: role
|
||||||
|
run_name: g1_gptlong_top8_32b__Qwen3-8B
|
||||||
|
save_steps: 300
|
||||||
|
save_strategy: steps
|
||||||
|
save_total_limit: 1
|
||||||
|
seed: 42
|
||||||
|
stage: sft
|
||||||
|
template: qwen3
|
||||||
|
trust_remote_code: true
|
||||||
|
user_tag: user
|
||||||
|
warmup_ratio: 0.1
|
||||||
|
weight_decay: 0.04
|
||||||
|
disable_shuffling: true
|
||||||
1
vocab.json
Normal file
1
vocab.json
Normal file
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user