初始化项目，由ModelHub XC社区提供模型

Model: DCAgent/g1_gptlong_top8_32b Source: Original Platform
2026-05-26 18:58:52 +08:00
commit b71d09829c
33 changed files with 156554 additions and 0 deletions
--- a/.gitattributes
+++ b/.gitattributes
@@ -0,0 +1,36 @@
 *.7z filter=lfs diff=lfs merge=lfs -text
 *.arrow filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
 *.bz2 filter=lfs diff=lfs merge=lfs -text
 *.ckpt filter=lfs diff=lfs merge=lfs -text
 *.ftz filter=lfs diff=lfs merge=lfs -text
 *.gz filter=lfs diff=lfs merge=lfs -text
 *.h5 filter=lfs diff=lfs merge=lfs -text
 *.joblib filter=lfs diff=lfs merge=lfs -text
 *.lfs.* filter=lfs diff=lfs merge=lfs -text
 *.mlmodel filter=lfs diff=lfs merge=lfs -text
 *.model filter=lfs diff=lfs merge=lfs -text
 *.msgpack filter=lfs diff=lfs merge=lfs -text
 *.npy filter=lfs diff=lfs merge=lfs -text
 *.npz filter=lfs diff=lfs merge=lfs -text
 *.onnx filter=lfs diff=lfs merge=lfs -text
 *.ot filter=lfs diff=lfs merge=lfs -text
 *.parquet filter=lfs diff=lfs merge=lfs -text
 *.pb filter=lfs diff=lfs merge=lfs -text
 *.pickle filter=lfs diff=lfs merge=lfs -text
 *.pkl filter=lfs diff=lfs merge=lfs -text
 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
 *.rar filter=lfs diff=lfs merge=lfs -text
 *.safetensors filter=lfs diff=lfs merge=lfs -text
 saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.tar.* filter=lfs diff=lfs merge=lfs -text
 *.tar filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text
 *.tgz filter=lfs diff=lfs merge=lfs -text
 *.wasm filter=lfs diff=lfs merge=lfs -text
 *.xz filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 tokenizer.json filter=lfs diff=lfs merge=lfs -text
--- a/README.md
+++ b/README.md
@@ -0,0 +1,60 @@
 ---
 library_name: transformers
 license: other
 base_model: Qwen/Qwen3-32B
 tags:
 - llama-factory
 - full
 - generated_from_trainer
 model-index:
 - name: sft_g1_gptlong_top8_32b__Qwen3-32B
  results: []
 ---
 <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 should probably proofread and complete it, then remove this comment. -->
 # sft_g1_gptlong_top8_32b__Qwen3-32B
 This model is a fine-tuned version of [Qwen/Qwen3-32B](https://huggingface.co/Qwen/Qwen3-32B) on the DCAgent/g1_min_episodes_e1_gpt_long_top8_glm47_traces dataset (local preprocessed snapshot: `/e/scratch/jureap59/raoof1/sft_data/hf_hub/datasets--DCAgent--g1_min_episodes_e1_gpt_long_top8_glm47_traces/snapshots/9828cc7d5cb31c19ed7e6dead76bd24dc2d66262_thinking_preprocessed`).
 ## Model description
 More information needed
 ## Intended uses & limitations
 More information needed
 ## Training and evaluation data
 More information needed
 ## Training procedure
 ### Training hyperparameters
 The following hyperparameters were used during training:
 - learning_rate: 4e-05
 - train_batch_size: 1
 - eval_batch_size: 8
 - seed: 42
 - distributed_type: multi-GPU
 - num_devices: 96
 - total_train_batch_size: 96
 - total_eval_batch_size: 768
 - optimizer: ADAMW_TORCH_FUSED (`OptimizerNames.ADAMW_TORCH_FUSED`) with betas=(0.9, 0.98), epsilon=1e-08, and no additional optimizer arguments
 - lr_scheduler_type: cosine
 - lr_scheduler_warmup_ratio: 0.1
 - num_epochs: 5.0
 ### Training results
 ### Framework versions
 - Transformers 4.57.6
 - Pytorch 2.9.1+cu130
 - Datasets 4.7.0
 - Tokenizers 0.22.2
--- a/added_tokens.json
+++ b/added_tokens.json
@@ -0,0 +1,28 @@
 {
  "</think>": 151668,
  "</tool_call>": 151658,
  "</tool_response>": 151666,
  "<think>": 151667,
  "<tool_call>": 151657,
  "<tool_response>": 151665,
  "<|box_end|>": 151649,
  "<|box_start|>": 151648,
  "<|endoftext|>": 151643,
  "<|file_sep|>": 151664,
  "<|fim_middle|>": 151660,
  "<|fim_pad|>": 151662,
  "<|fim_prefix|>": 151659,
  "<|fim_suffix|>": 151661,
  "<|im_end|>": 151645,
  "<|im_start|>": 151644,
  "<|image_pad|>": 151655,
  "<|object_ref_end|>": 151647,
  "<|object_ref_start|>": 151646,
  "<|quad_end|>": 151651,
  "<|quad_start|>": 151650,
  "<|repo_name|>": 151663,
  "<|video_pad|>": 151656,
  "<|vision_end|>": 151653,
  "<|vision_pad|>": 151654,
  "<|vision_start|>": 151652
 }
--- a/all_results.json
+++ b/all_results.json
@@ -0,0 +1,16 @@
 {
    "achieved_tflops_per_gpu": 0.010354545414379324,
    "achieved_tflops_per_gpu_theoretical": 1519.821207581491,
    "epoch": 5.0,
    "loss_nan_ranks": 0,
    "loss_rank_avg": 0.21731798350811005,
    "mfu_percent": 0.0007317699939490688,
    "mfu_percent_theoretical": 107.40785919303823,
    "total_flos": 9850857283649536.0,
    "train_loss": 0.03974963869139502,
    "train_runtime": 9909.9567,
    "train_samples_per_second": 14.475,
    "train_steps_per_second": 0.151,
    "valid_targets_mean": 1465.8,
    "valid_targets_min": 325
 }
--- a/chat_template.jinja
+++ b/chat_template.jinja
@@ -0,0 +1,89 @@
 {#- System preamble.
     With tools: emit one system turn holding the leading system message (if any),
     followed by the tool signatures (one JSON object per line inside the tools tags)
     and the instructions for the tool_call output format.
     Without tools: emit the leading system message as its own turn, if present. #}
 {%- if tools %}
    {{- '<|im_start|>system\n' }}
    {%- if messages[0].role == 'system' %}
        {{- messages[0].content + '\n\n' }}
    {%- endif %}
    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
    {%- for tool in tools %}
        {{- "\n" }}
        {{- tool | tojson }}
    {%- endfor %}
    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
 {%- else %}
    {%- if messages[0].role == 'system' %}
        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
    {%- endif %}
 {%- endif %}
 {#- Find ns.last_query_index: walk the messages from last to first and record the index
     of the first user message encountered whose string content is not entirely wrapped
     in tool_response tags. ns.multi_step_tool acts as a "still searching" flag so only
     that one message is recorded; if none matches, the index stays at the last message. #}
 {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
 {%- for message in messages[::-1] %}
    {#- loop.index0 counts positions in the reversed list; convert back to the original index. #}
    {%- set index = (messages|length - 1) - loop.index0 %}
    {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
        {%- set ns.multi_step_tool = false %}
        {%- set ns.last_query_index = index %}
    {%- endif %}
 {%- endfor %}
 {#- Render every message as a chat turn. Content that is not a string is rendered as
     an empty string. A leading system message is skipped here because the first branch
     below excludes it (it is handled before this loop). #}
 {%- for message in messages %}
    {%- if message.content is string %}
        {%- set content = message.content %}
    {%- else %}
        {%- set content = '' %}
    {%- endif %}
    {#- User turns, and system messages other than the first message, are emitted verbatim. #}
    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
        {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
    {%- elif message.role == "assistant" %}
        {#- Reasoning text comes from message.reasoning_content when that is a string;
            otherwise, if the content contains a closing think tag, the text before the
            tag becomes the reasoning and the text after it becomes the content. #}
        {%- set reasoning_content = '' %}
        {%- if message.reasoning_content is string %}
            {%- set reasoning_content = message.reasoning_content %}
        {%- else %}
            {%- if '</think>' in content %}
                {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
                {%- set content = content.split('</think>')[-1].lstrip('\n') %}
            {%- endif %}
        {%- endif %}
        {#- A think block is rendered only for assistant turns located after
            ns.last_query_index, and only when the turn is the final message or has
            non-empty reasoning. Every other assistant turn renders its content alone. #}
        {%- if loop.index0 > ns.last_query_index %}
            {%- if loop.last or (not loop.last and reasoning_content) %}
                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
            {%- else %}
                {{- '<|im_start|>' + message.role + '\n' + content }}
            {%- endif %}
        {%- else %}
            {{- '<|im_start|>' + message.role + '\n' + content }}
        {%- endif %}
        {#- Append each tool call as a JSON object with "name" and "arguments" keys,
            wrapped in tool_call tags. #}
        {%- if message.tool_calls %}
            {%- for tool_call in message.tool_calls %}
                {#- Newline separator: before the first call only when content precedes it,
                    and before every subsequent call. #}
                {%- if (loop.first and content) or (not loop.first) %}
                    {{- '\n' }}
                {%- endif %}
                {#- Accept both the nested form (tool_call.function) and the flat form. #}
                {%- if tool_call.function %}
                    {%- set tool_call = tool_call.function %}
                {%- endif %}
                {{- '<tool_call>\n{"name": "' }}
                {{- tool_call.name }}
                {{- '", "arguments": ' }}
                {#- String arguments are emitted as-is; any other value is serialized with tojson. #}
                {%- if tool_call.arguments is string %}
                    {{- tool_call.arguments }}
                {%- else %}
                    {{- tool_call.arguments | tojson }}
                {%- endif %}
                {{- '}\n</tool_call>' }}
            {%- endfor %}
        {%- endif %}
        {{- '<|im_end|>\n' }}
    {%- elif message.role == "tool" %}
        {#- Tool results are rendered inside a user turn. A run of consecutive tool
            messages shares one turn: it is opened before the first message of the run
            and closed after the last one. #}
        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
            {{- '<|im_start|>user' }}
        {%- endif %}
        {{- '\n<tool_response>\n' }}
        {{- content }}
        {{- '\n</tool_response>' }}
        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
            {{- '<|im_end|>\n' }}
        {%- endif %}
    {%- endif %}
 {%- endfor %}
 {#- Generation prompt: when requested, open an assistant turn. If enable_thinking is
     defined and explicitly false, pre-fill an empty think block after the turn header;
     when it is undefined or true, nothing is pre-filled. #}
 {%- if add_generation_prompt %}
    {{- '<|im_start|>assistant\n' }}
    {%- if enable_thinking is defined and enable_thinking is false %}
        {{- '<think>\n\n</think>\n\n' }}
    {%- endif %}
 {%- endif %}
--- a/config.json
+++ b/config.json
@@ -0,0 +1,96 @@
 {
  "architectures": [
    "Qwen3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "dtype": "bfloat16",
  "eos_token_id": 151645,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 5120,
  "initializer_range": 0.02,
  "intermediate_size": 25600,
  "layer_types": [
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention"
  ],
  "max_position_embeddings": 40960,
  "max_window_layers": 64,
  "model_type": "qwen3",
  "num_attention_heads": 64,
  "num_hidden_layers": 64,
  "num_key_value_heads": 8,
  "pad_token_id": 151643,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000,
  "sliding_window": null,
  "tie_word_embeddings": false,
  "transformers_version": "4.57.6",
  "use_cache": false,
  "use_sliding_window": false,
  "vocab_size": 151936
 }
--- a/generation_config.json
+++ b/generation_config.json
@@ -0,0 +1,12 @@
 {
  "do_sample": true,
  "eos_token_id": [
    151645,
    151643
  ],
  "pad_token_id": 151643,
  "temperature": 0.6,
  "top_k": 20,
  "top_p": 0.95,
  "transformers_version": "4.57.6"
 }
--- a/merges.txt
+++ b/merges.txt
--- a/model-00001-of-00014.safetensors
+++ b/model-00001-of-00014.safetensors
@@ -0,0 +1,3 @@
 version https://git-lfs.github.com/spec/v1
 oid sha256:870ee9f46c8d5facda4a3b9315edcd76a1f0766bcda10e4b386e414b132a4184
 size 4932307584
--- a/model-00002-of-00014.safetensors
+++ b/model-00002-of-00014.safetensors
@@ -0,0 +1,3 @@
 version https://git-lfs.github.com/spec/v1
 oid sha256:81bc627c621b5c983ca07605d60ac038f2885f795e538e98cddfac3850faffe5
 size 4875989696
--- a/model-00003-of-00014.safetensors
+++ b/model-00003-of-00014.safetensors
@@ -0,0 +1,3 @@
 version https://git-lfs.github.com/spec/v1
 oid sha256:be969b1429d3a020a2f2abf702bc99509bb0c823fc0c567f511c2cda55c3354b
 size 4875989720
--- a/model-00004-of-00014.safetensors
+++ b/model-00004-of-00014.safetensors
@@ -0,0 +1,3 @@
 version https://git-lfs.github.com/spec/v1
 oid sha256:0d316fc9d7a5f311713ecd75df2b614a00ed2f4f039ace1a951790f05cb02220
 size 4875989752
--- a/model-00005-of-00014.safetensors
+++ b/model-00005-of-00014.safetensors
@@ -0,0 +1,3 @@
 version https://git-lfs.github.com/spec/v1
 oid sha256:6ada5802bfbe8a8f76ccfe9ecbaa63729035c175f55d46786d09decf24b293f0
 size 4875989752
--- a/model-00006-of-00014.safetensors
+++ b/model-00006-of-00014.safetensors
@@ -0,0 +1,3 @@
 version https://git-lfs.github.com/spec/v1
 oid sha256:ffe081c3ac992a055e593d8daa9ef0a87bfc8820dbb817af9dbb0ac914f71a3c
 size 4875989752
--- a/model-00007-of-00014.safetensors
+++ b/model-00007-of-00014.safetensors
@@ -0,0 +1,3 @@
 version https://git-lfs.github.com/spec/v1
 oid sha256:3e876cd68dc4341f8a03ff8d188bdcec4cfd83db29955d92a57cbe39100c31b1
 size 4875989752
--- a/model-00008-of-00014.safetensors
+++ b/model-00008-of-00014.safetensors
@@ -0,0 +1,3 @@
 version https://git-lfs.github.com/spec/v1
 oid sha256:d294973f0c4e80afd24458f7cb3344ae8acc256ddd69d042b47135df5d6d6d0c
 size 4875989752
--- a/model-00009-of-00014.safetensors
+++ b/model-00009-of-00014.safetensors
@@ -0,0 +1,3 @@
 version https://git-lfs.github.com/spec/v1
 oid sha256:265b102f5bba0cdc030f3755570d64b37f0ff20ba03f95014207a5da03d7c276
 size 4875989752
--- a/model-00010-of-00014.safetensors
+++ b/model-00010-of-00014.safetensors
@@ -0,0 +1,3 @@
 version https://git-lfs.github.com/spec/v1
 oid sha256:9e41cfef8952d6aa251dad2095f25b9a864658cff7c2a78640b62f44fc24faf2
 size 4875989752
--- a/model-00011-of-00014.safetensors
+++ b/model-00011-of-00014.safetensors
@@ -0,0 +1,3 @@
 version https://git-lfs.github.com/spec/v1
 oid sha256:a715c7d3191d0076f40d2584fd39be14d36c072d8ceb2ac66cbe151603292322
 size 4875989752
--- a/model-00012-of-00014.safetensors
+++ b/model-00012-of-00014.safetensors
@@ -0,0 +1,3 @@
 version https://git-lfs.github.com/spec/v1
 oid sha256:a429b7ea08bdf1e3dddbf63acc3e790d51ea4ac7b57a9b3736040cd44866aa98
 size 4875989752
--- a/model-00013-of-00014.safetensors
+++ b/model-00013-of-00014.safetensors
@@ -0,0 +1,3 @@
 version https://git-lfs.github.com/spec/v1
 oid sha256:b3654e4b77f9ba5a0e66b80c7efeb3293751b07e2c5b7c4c512824a9589ab562
 size 4875989752
--- a/model-00014-of-00014.safetensors
+++ b/model-00014-of-00014.safetensors
@@ -0,0 +1,3 @@
 version https://git-lfs.github.com/spec/v1
 oid sha256:cee186efdd1b9df03ed54ec89e07a5e2c849a0a2a8668cfee8942cd924ec72c5
 size 2080144040
--- a/model.safetensors.index.json
+++ b/model.safetensors.index.json
@@ -0,0 +1,715 @@
 {
  "metadata": {
    "total_parameters": 676864,
    "total_size": 65524246528
  },
  "weight_map": {
    "lm_head.weight": "model-00014-of-00014.safetensors",
    "model.embed_tokens.weight": "model-00001-of-00014.safetensors",
    "model.layers.0.input_layernorm.weight": "model-00001-of-00014.safetensors",
    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00014.safetensors",
    "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00014.safetensors",
    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00014.safetensors",
    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00014.safetensors",
    "model.layers.0.self_attn.k_norm.weight": "model-00001-of-00014.safetensors",
    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00014.safetensors",
    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00014.safetensors",
    "model.layers.0.self_attn.q_norm.weight": "model-00001-of-00014.safetensors",
    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00014.safetensors",
    "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00014.safetensors",
    "model.layers.1.input_layernorm.weight": "model-00001-of-00014.safetensors",
    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00014.safetensors",
    "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00014.safetensors",
    "model.layers.1.mlp.up_proj.weight": "model-00001-of-00014.safetensors",
    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00014.safetensors",
    "model.layers.1.self_attn.k_norm.weight": "model-00001-of-00014.safetensors",
    "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00014.safetensors",
    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00014.safetensors",
    "model.layers.1.self_attn.q_norm.weight": "model-00001-of-00014.safetensors",
    "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00014.safetensors",
    "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00014.safetensors",
    "model.layers.10.input_layernorm.weight": "model-00003-of-00014.safetensors",
    "model.layers.10.mlp.down_proj.weight": "model-00003-of-00014.safetensors",
    "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00014.safetensors",
    "model.layers.10.mlp.up_proj.weight": "model-00003-of-00014.safetensors",
    "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00014.safetensors",
    "model.layers.10.self_attn.k_norm.weight": "model-00003-of-00014.safetensors",
    "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00014.safetensors",
    "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00014.safetensors",
    "model.layers.10.self_attn.q_norm.weight": "model-00003-of-00014.safetensors",
    "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00014.safetensors",
    "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00014.safetensors",
    "model.layers.11.input_layernorm.weight": "model-00003-of-00014.safetensors",
    "model.layers.11.mlp.down_proj.weight": "model-00003-of-00014.safetensors",
    "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00014.safetensors",
    "model.layers.11.mlp.up_proj.weight": "model-00003-of-00014.safetensors",
    "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00014.safetensors",
    "model.layers.11.self_attn.k_norm.weight": "model-00003-of-00014.safetensors",
    "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00014.safetensors",
    "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00014.safetensors",
    "model.layers.11.self_attn.q_norm.weight": "model-00003-of-00014.safetensors",
    "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00014.safetensors",
    "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00014.safetensors",
    "model.layers.12.input_layernorm.weight": "model-00003-of-00014.safetensors",
    "model.layers.12.mlp.down_proj.weight": "model-00003-of-00014.safetensors",
    "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00014.safetensors",
    "model.layers.12.mlp.up_proj.weight": "model-00003-of-00014.safetensors",
    "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00014.safetensors",
    "model.layers.12.self_attn.k_norm.weight": "model-00003-of-00014.safetensors",
    "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00014.safetensors",
    "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00014.safetensors",
    "model.layers.12.self_attn.q_norm.weight": "model-00003-of-00014.safetensors",
    "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00014.safetensors",
    "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00014.safetensors",
    "model.layers.13.input_layernorm.weight": "model-00004-of-00014.safetensors",
    "model.layers.13.mlp.down_proj.weight": "model-00004-of-00014.safetensors",
    "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00014.safetensors",
    "model.layers.13.mlp.up_proj.weight": "model-00004-of-00014.safetensors",
    "model.layers.13.post_attention_layernorm.weight": "model-00004-of-00014.safetensors",
    "model.layers.13.self_attn.k_norm.weight": "model-00003-of-00014.safetensors",
    "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00014.safetensors",
    "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00014.safetensors",
    "model.layers.13.self_attn.q_norm.weight": "model-00003-of-00014.safetensors",
    "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00014.safetensors",
    "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00014.safetensors",
    "model.layers.14.input_layernorm.weight": "model-00004-of-00014.safetensors",
    "model.layers.14.mlp.down_proj.weight": "model-00004-of-00014.safetensors",
    "model.layers.14.mlp.gate_proj.weight": "model-00004-of-00014.safetensors",
    "model.layers.14.mlp.up_proj.weight": "model-00004-of-00014.safetensors",
    "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00014.safetensors",
    "model.layers.14.self_attn.k_norm.weight": "model-00004-of-00014.safetensors",
    "model.layers.14.self_attn.k_proj.weight": "model-00004-of-00014.safetensors",
    "model.layers.14.self_attn.o_proj.weight": "model-00004-of-00014.safetensors",
    "model.layers.14.self_attn.q_norm.weight": "model-00004-of-00014.safetensors",
    "model.layers.14.self_attn.q_proj.weight": "model-00004-of-00014.safetensors",
    "model.layers.14.self_attn.v_proj.weight": "model-00004-of-00014.safetensors",
    "model.layers.15.input_layernorm.weight": "model-00004-of-00014.safetensors",
    "model.layers.15.mlp.down_proj.weight": "model-00004-of-00014.safetensors",
    "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00014.safetensors",
    "model.layers.15.mlp.up_proj.weight": "model-00004-of-00014.safetensors",
    "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00014.safetensors",
    "model.layers.15.self_attn.k_norm.weight": "model-00004-of-00014.safetensors",
    "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00014.safetensors",
    "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00014.safetensors",
    "model.layers.15.self_attn.q_norm.weight": "model-00004-of-00014.safetensors",
    "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00014.safetensors",
    "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00014.safetensors",
    "model.layers.16.input_layernorm.weight": "model-00004-of-00014.safetensors",
    "model.layers.16.mlp.down_proj.weight": "model-00004-of-00014.safetensors",
    "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00014.safetensors",
    "model.layers.16.mlp.up_proj.weight": "model-00004-of-00014.safetensors",
    "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00014.safetensors",
    "model.layers.16.self_attn.k_norm.weight": "model-00004-of-00014.safetensors",
    "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00014.safetensors",
    "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00014.safetensors",
    "model.layers.16.self_attn.q_norm.weight": "model-00004-of-00014.safetensors",
    "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00014.safetensors",
    "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00014.safetensors",
    "model.layers.17.input_layernorm.weight": "model-00004-of-00014.safetensors",
    "model.layers.17.mlp.down_proj.weight": "model-00004-of-00014.safetensors",
    "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00014.safetensors",
    "model.layers.17.mlp.up_proj.weight": "model-00004-of-00014.safetensors",
    "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00014.safetensors",
    "model.layers.17.self_attn.k_norm.weight": "model-00004-of-00014.safetensors",
    "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00014.safetensors",
    "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00014.safetensors",
    "model.layers.17.self_attn.q_norm.weight": "model-00004-of-00014.safetensors",
    "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00014.safetensors",
    "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00014.safetensors",
    "model.layers.18.input_layernorm.weight": "model-00005-of-00014.safetensors",
    "model.layers.18.mlp.down_proj.weight": "model-00005-of-00014.safetensors",
    "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00014.safetensors",
    "model.layers.18.mlp.up_proj.weight": "model-00005-of-00014.safetensors",
    "model.layers.18.post_attention_layernorm.weight": "model-00005-of-00014.safetensors",
    "model.layers.18.self_attn.k_norm.weight": "model-00004-of-00014.safetensors",
    "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00014.safetensors",
    "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00014.safetensors",
    "model.layers.18.self_attn.q_norm.weight": "model-00004-of-00014.safetensors",
    "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00014.safetensors",
    "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00014.safetensors",
    "model.layers.19.input_layernorm.weight": "model-00005-of-00014.safetensors",
    "model.layers.19.mlp.down_proj.weight": "model-00005-of-00014.safetensors",
    "model.layers.19.mlp.gate_proj.weight": "model-00005-of-00014.safetensors",
    "model.layers.19.mlp.up_proj.weight": "model-00005-of-00014.safetensors",
    "model.layers.19.post_attention_layernorm.weight": "model-00005-of-00014.safetensors",
    "model.layers.19.self_attn.k_norm.weight": "model-00005-of-00014.safetensors",
    "model.layers.19.self_attn.k_proj.weight": "model-00005-of-00014.safetensors",
    "model.layers.19.self_attn.o_proj.weight": "model-00005-of-00014.safetensors",
    "model.layers.19.self_attn.q_norm.weight": "model-00005-of-00014.safetensors",
    "model.layers.19.self_attn.q_proj.weight": "model-00005-of-00014.safetensors",
    "model.layers.19.self_attn.v_proj.weight": "model-00005-of-00014.safetensors",
    "model.layers.2.input_layernorm.weight": "model-00001-of-00014.safetensors",
    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00014.safetensors",
    "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00014.safetensors",
    "model.layers.2.mlp.up_proj.weight": "model-00001-of-00014.safetensors",
    "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00014.safetensors",
    "model.layers.2.self_attn.k_norm.weight": "model-00001-of-00014.safetensors",
    "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00014.safetensors",
    "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00014.safetensors",
    "model.layers.2.self_attn.q_norm.weight": "model-00001-of-00014.safetensors",
    "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00014.safetensors",
    "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00014.safetensors",
    "model.layers.20.input_layernorm.weight": "model-00005-of-00014.safetensors",
    "model.layers.20.mlp.down_proj.weight": "model-00005-of-00014.safetensors",
    "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00014.safetensors",
    "model.layers.20.mlp.up_proj.weight": "model-00005-of-00014.safetensors",
    "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00014.safetensors",
    "model.layers.20.self_attn.k_norm.weight": "model-00005-of-00014.safetensors",
    "model.layers.20.self_attn.k_proj.weight": "model-00005-of-00014.safetensors",
    "model.layers.20.self_attn.o_proj.weight": "model-00005-of-00014.safetensors",
    "model.layers.20.self_attn.q_norm.weight": "model-00005-of-00014.safetensors",
    "model.layers.20.self_attn.q_proj.weight": "model-00005-of-00014.safetensors",
    "model.layers.20.self_attn.v_proj.weight": "model-00005-of-00014.safetensors",
    "model.layers.21.input_layernorm.weight": "model-00005-of-00014.safetensors",
    "model.layers.21.mlp.down_proj.weight": "model-00005-of-00014.safetensors",
    "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00014.safetensors",
    "model.layers.21.mlp.up_proj.weight": "model-00005-of-00014.safetensors",
    "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00014.safetensors",
    "model.layers.21.self_attn.k_norm.weight": "model-00005-of-00014.safetensors",
    "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00014.safetensors",
    "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00014.safetensors",
    "model.layers.21.self_attn.q_norm.weight": "model-00005-of-00014.safetensors",
    "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00014.safetensors",
    "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00014.safetensors",
    "model.layers.22.input_layernorm.weight": "model-00005-of-00014.safetensors",
    "model.layers.22.mlp.down_proj.weight": "model-00005-of-00014.safetensors",
    "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00014.safetensors",
    "model.layers.22.mlp.up_proj.weight": "model-00005-of-00014.safetensors",
    "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00014.safetensors",
    "model.layers.22.self_attn.k_norm.weight": "model-00005-of-00014.safetensors",
    "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00014.safetensors",
    "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00014.safetensors",
    "model.layers.22.self_attn.q_norm.weight": "model-00005-of-00014.safetensors",
    "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00014.safetensors",
    "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00014.safetensors",
    "model.layers.23.input_layernorm.weight": "model-00006-of-00014.safetensors",
    "model.layers.23.mlp.down_proj.weight": "model-00006-of-00014.safetensors",
    "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00014.safetensors",
    "model.layers.23.mlp.up_proj.weight": "model-00006-of-00014.safetensors",
    "model.layers.23.post_attention_layernorm.weight": "model-00006-of-00014.safetensors",
    "model.layers.23.self_attn.k_norm.weight": "model-00005-of-00014.safetensors",
    "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00014.safetensors",
    "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00014.safetensors",
    "model.layers.23.self_attn.q_norm.weight": "model-00005-of-00014.safetensors",
    "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00014.safetensors",
    "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00014.safetensors",
    "model.layers.24.input_layernorm.weight": "model-00006-of-00014.safetensors",
    "model.layers.24.mlp.down_proj.weight": "model-00006-of-00014.safetensors",
    "model.layers.24.mlp.gate_proj.weight": "model-00006-of-00014.safetensors",
    "model.layers.24.mlp.up_proj.weight": "model-00006-of-00014.safetensors",
    "model.layers.24.post_attention_layernorm.weight": "model-00006-of-00014.safetensors",
    "model.layers.24.self_attn.k_norm.weight": "model-00006-of-00014.safetensors",
    "model.layers.24.self_attn.k_proj.weight": "model-00006-of-00014.safetensors",
    "model.layers.24.self_attn.o_proj.weight": "model-00006-of-00014.safetensors",
    "model.layers.24.self_attn.q_norm.weight": "model-00006-of-00014.safetensors",
    "model.layers.24.self_attn.q_proj.weight": "model-00006-of-00014.safetensors",
    "model.layers.24.self_attn.v_proj.weight": "model-00006-of-00014.safetensors",
    "model.layers.25.input_layernorm.weight": "model-00006-of-00014.safetensors",
    "model.layers.25.mlp.down_proj.weight": "model-00006-of-00014.safetensors",
    "model.layers.25.mlp.gate_proj.weight": "model-00006-of-00014.safetensors",
    "model.layers.25.mlp.up_proj.weight": "model-00006-of-00014.safetensors",
    "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00014.safetensors",
    "model.layers.25.self_attn.k_norm.weight": "model-00006-of-00014.safetensors",
    "model.layers.25.self_attn.k_proj.weight": "model-00006-of-00014.safetensors",
    "model.layers.25.self_attn.o_proj.weight": "model-00006-of-00014.safetensors",
    "model.layers.25.self_attn.q_norm.weight": "model-00006-of-00014.safetensors",
    "model.layers.25.self_attn.q_proj.weight": "model-00006-of-00014.safetensors",
    "model.layers.25.self_attn.v_proj.weight": "model-00006-of-00014.safetensors",
    "model.layers.26.input_layernorm.weight": "model-00006-of-00014.safetensors",
    "model.layers.26.mlp.down_proj.weight": "model-00006-of-00014.safetensors",
    "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00014.safetensors",
    "model.layers.26.mlp.up_proj.weight": "model-00006-of-00014.safetensors",
    "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00014.safetensors",
    "model.layers.26.self_attn.k_norm.weight": "model-00006-of-00014.safetensors",
    "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00014.safetensors",
    "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00014.safetensors",
    "model.layers.26.self_attn.q_norm.weight": "model-00006-of-00014.safetensors",
    "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00014.safetensors",
    "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00014.safetensors",
    "model.layers.27.input_layernorm.weight": "model-00006-of-00014.safetensors",
    "model.layers.27.mlp.down_proj.weight": "model-00006-of-00014.safetensors",
    "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00014.safetensors",
    "model.layers.27.mlp.up_proj.weight": "model-00006-of-00014.safetensors",
    "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00014.safetensors",
    "model.layers.27.self_attn.k_norm.weight": "model-00006-of-00014.safetensors",
    "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00014.safetensors",
    "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00014.safetensors",
    "model.layers.27.self_attn.q_norm.weight": "model-00006-of-00014.safetensors",
    "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00014.safetensors",
    "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00014.safetensors",
    "model.layers.28.input_layernorm.weight": "model-00007-of-00014.safetensors",
    "model.layers.28.mlp.down_proj.weight": "model-00007-of-00014.safetensors",
    "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00014.safetensors",
    "model.layers.28.mlp.up_proj.weight": "model-00007-of-00014.safetensors",
    "model.layers.28.post_attention_layernorm.weight": "model-00007-of-00014.safetensors",
    "model.layers.28.self_attn.k_norm.weight": "model-00006-of-00014.safetensors",
    "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00014.safetensors",
    "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00014.safetensors",
    "model.layers.28.self_attn.q_norm.weight": "model-00006-of-00014.safetensors",
    "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00014.safetensors",
    "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00014.safetensors",
    "model.layers.29.input_layernorm.weight": "model-00007-of-00014.safetensors",
    "model.layers.29.mlp.down_proj.weight": "model-00007-of-00014.safetensors",
    "model.layers.29.mlp.gate_proj.weight": "model-00007-of-00014.safetensors",
    "model.layers.29.mlp.up_proj.weight": "model-00007-of-00014.safetensors",
    "model.layers.29.post_attention_layernorm.weight": "model-00007-of-00014.safetensors",
    "model.layers.29.self_attn.k_norm.weight": "model-00007-of-00014.safetensors",
    "model.layers.29.self_attn.k_proj.weight": "model-00007-of-00014.safetensors",
    "model.layers.29.self_attn.o_proj.weight": "model-00007-of-00014.safetensors",
    "model.layers.29.self_attn.q_norm.weight": "model-00007-of-00014.safetensors",
    "model.layers.29.self_attn.q_proj.weight": "model-00007-of-00014.safetensors",
    "model.layers.29.self_attn.v_proj.weight": "model-00007-of-00014.safetensors",
    "model.layers.3.input_layernorm.weight": "model-00002-of-00014.safetensors",
    "model.layers.3.mlp.down_proj.weight": "model-00002-of-00014.safetensors",
    "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00014.safetensors",
    "model.layers.3.mlp.up_proj.weight": "model-00002-of-00014.safetensors",
    "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00014.safetensors",
    "model.layers.3.self_attn.k_norm.weight": "model-00001-of-00014.safetensors",
    "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00014.safetensors",
    "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00014.safetensors",
    "model.layers.3.self_attn.q_norm.weight": "model-00001-of-00014.safetensors",
    "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00014.safetensors",
    "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00014.safetensors",
    "model.layers.30.input_layernorm.weight": "model-00007-of-00014.safetensors",
    "model.layers.30.mlp.down_proj.weight": "model-00007-of-00014.safetensors",
    "model.layers.30.mlp.gate_proj.weight": "model-00007-of-00014.safetensors",
    "model.layers.30.mlp.up_proj.weight": "model-00007-of-00014.safetensors",
    "model.layers.30.post_attention_layernorm.weight": "model-00007-of-00014.safetensors",
    "model.layers.30.self_attn.k_norm.weight": "model-00007-of-00014.safetensors",
    "model.layers.30.self_attn.k_proj.weight": "model-00007-of-00014.safetensors",
    "model.layers.30.self_attn.o_proj.weight": "model-00007-of-00014.safetensors",
    "model.layers.30.self_attn.q_norm.weight": "model-00007-of-00014.safetensors",
    "model.layers.30.self_attn.q_proj.weight": "model-00007-of-00014.safetensors",
    "model.layers.30.self_attn.v_proj.weight": "model-00007-of-00014.safetensors",
    "model.layers.31.input_layernorm.weight": "model-00007-of-00014.safetensors",
    "model.layers.31.mlp.down_proj.weight": "model-00007-of-00014.safetensors",
    "model.layers.31.mlp.gate_proj.weight": "model-00007-of-00014.safetensors",
    "model.layers.31.mlp.up_proj.weight": "model-00007-of-00014.safetensors",
    "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00014.safetensors",
    "model.layers.31.self_attn.k_norm.weight": "model-00007-of-00014.safetensors",
    "model.layers.31.self_attn.k_proj.weight": "model-00007-of-00014.safetensors",
    "model.layers.31.self_attn.o_proj.weight": "model-00007-of-00014.safetensors",
    "model.layers.31.self_attn.q_norm.weight": "model-00007-of-00014.safetensors",
    "model.layers.31.self_attn.q_proj.weight": "model-00007-of-00014.safetensors",
    "model.layers.31.self_attn.v_proj.weight": "model-00007-of-00014.safetensors",
    "model.layers.32.input_layernorm.weight": "model-00007-of-00014.safetensors",
    "model.layers.32.mlp.down_proj.weight": "model-00007-of-00014.safetensors",
    "model.layers.32.mlp.gate_proj.weight": "model-00007-of-00014.safetensors",
    "model.layers.32.mlp.up_proj.weight": "model-00007-of-00014.safetensors",
    "model.layers.32.post_attention_layernorm.weight": "model-00007-of-00014.safetensors",
    "model.layers.32.self_attn.k_norm.weight": "model-00007-of-00014.safetensors",
    "model.layers.32.self_attn.k_proj.weight": "model-00007-of-00014.safetensors",
    "model.layers.32.self_attn.o_proj.weight": "model-00007-of-00014.safetensors",
    "model.layers.32.self_attn.q_norm.weight": "model-00007-of-00014.safetensors",
    "model.layers.32.self_attn.q_proj.weight": "model-00007-of-00014.safetensors",
    "model.layers.32.self_attn.v_proj.weight": "model-00007-of-00014.safetensors",
    "model.layers.33.input_layernorm.weight": "model-00008-of-00014.safetensors",
    "model.layers.33.mlp.down_proj.weight": "model-00008-of-00014.safetensors",
    "model.layers.33.mlp.gate_proj.weight": "model-00007-of-00014.safetensors",
    "model.layers.33.mlp.up_proj.weight": "model-00008-of-00014.safetensors",
    "model.layers.33.post_attention_layernorm.weight": "model-00008-of-00014.safetensors",
    "model.layers.33.self_attn.k_norm.weight": "model-00007-of-00014.safetensors",
    "model.layers.33.self_attn.k_proj.weight": "model-00007-of-00014.safetensors",
    "model.layers.33.self_attn.o_proj.weight": "model-00007-of-00014.safetensors",
    "model.layers.33.self_attn.q_norm.weight": "model-00007-of-00014.safetensors",
    "model.layers.33.self_attn.q_proj.weight": "model-00007-of-00014.safetensors",
    "model.layers.33.self_attn.v_proj.weight": "model-00007-of-00014.safetensors",
    "model.layers.34.input_layernorm.weight": "model-00008-of-00014.safetensors",
    "model.layers.34.mlp.down_proj.weight": "model-00008-of-00014.safetensors",
    "model.layers.34.mlp.gate_proj.weight": "model-00008-of-00014.safetensors",
    "model.layers.34.mlp.up_proj.weight": "model-00008-of-00014.safetensors",
    "model.layers.34.post_attention_layernorm.weight": "model-00008-of-00014.safetensors",
    "model.layers.34.self_attn.k_norm.weight": "model-00008-of-00014.safetensors",
    "model.layers.34.self_attn.k_proj.weight": "model-00008-of-00014.safetensors",
    "model.layers.34.self_attn.o_proj.weight": "model-00008-of-00014.safetensors",
    "model.layers.34.self_attn.q_norm.weight": "model-00008-of-00014.safetensors",
    "model.layers.34.self_attn.q_proj.weight": "model-00008-of-00014.safetensors",
    "model.layers.34.self_attn.v_proj.weight": "model-00008-of-00014.safetensors",
    "model.layers.35.input_layernorm.weight": "model-00008-of-00014.safetensors",
    "model.layers.35.mlp.down_proj.weight": "model-00008-of-00014.safetensors",
    "model.layers.35.mlp.gate_proj.weight": "model-00008-of-00014.safetensors",
    "model.layers.35.mlp.up_proj.weight": "model-00008-of-00014.safetensors",
    "model.layers.35.post_attention_layernorm.weight": "model-00008-of-00014.safetensors",
    "model.layers.35.self_attn.k_norm.weight": "model-00008-of-00014.safetensors",
    "model.layers.35.self_attn.k_proj.weight": "model-00008-of-00014.safetensors",
    "model.layers.35.self_attn.o_proj.weight": "model-00008-of-00014.safetensors",
    "model.layers.35.self_attn.q_norm.weight": "model-00008-of-00014.safetensors",
    "model.layers.35.self_attn.q_proj.weight": "model-00008-of-00014.safetensors",
    "model.layers.35.self_attn.v_proj.weight": "model-00008-of-00014.safetensors",
    "model.layers.36.input_layernorm.weight": "model-00008-of-00014.safetensors",
    "model.layers.36.mlp.down_proj.weight": "model-00008-of-00014.safetensors",
    "model.layers.36.mlp.gate_proj.weight": "model-00008-of-00014.safetensors",
    "model.layers.36.mlp.up_proj.weight": "model-00008-of-00014.safetensors",
    "model.layers.36.post_attention_layernorm.weight": "model-00008-of-00014.safetensors",
    "model.layers.36.self_attn.k_norm.weight": "model-00008-of-00014.safetensors",
    "model.layers.36.self_attn.k_proj.weight": "model-00008-of-00014.safetensors",
    "model.layers.36.self_attn.o_proj.weight": "model-00008-of-00014.safetensors",
    "model.layers.36.self_attn.q_norm.weight": "model-00008-of-00014.safetensors",
    "model.layers.36.self_attn.q_proj.weight": "model-00008-of-00014.safetensors",
    "model.layers.36.self_attn.v_proj.weight": "model-00008-of-00014.safetensors",
    "model.layers.37.input_layernorm.weight": "model-00008-of-00014.safetensors",
    "model.layers.37.mlp.down_proj.weight": "model-00008-of-00014.safetensors",
    "model.layers.37.mlp.gate_proj.weight": "model-00008-of-00014.safetensors",
    "model.layers.37.mlp.up_proj.weight": "model-00008-of-00014.safetensors",
    "model.layers.37.post_attention_layernorm.weight": "model-00008-of-00014.safetensors",
    "model.layers.37.self_attn.k_norm.weight": "model-00008-of-00014.safetensors",
    "model.layers.37.self_attn.k_proj.weight": "model-00008-of-00014.safetensors",
    "model.layers.37.self_attn.o_proj.weight": "model-00008-of-00014.safetensors",
    "model.layers.37.self_attn.q_norm.weight": "model-00008-of-00014.safetensors",
    "model.layers.37.self_attn.q_proj.weight": "model-00008-of-00014.safetensors",
    "model.layers.37.self_attn.v_proj.weight": "model-00008-of-00014.safetensors",
    "model.layers.38.input_layernorm.weight": "model-00009-of-00014.safetensors",
    "model.layers.38.mlp.down_proj.weight": "model-00009-of-00014.safetensors",
    "model.layers.38.mlp.gate_proj.weight": "model-00008-of-00014.safetensors",
    "model.layers.38.mlp.up_proj.weight": "model-00009-of-00014.safetensors",
    "model.layers.38.post_attention_layernorm.weight": "model-00009-of-00014.safetensors",
    "model.layers.38.self_attn.k_norm.weight": "model-00008-of-00014.safetensors",
    "model.layers.38.self_attn.k_proj.weight": "model-00008-of-00014.safetensors",
    "model.layers.38.self_attn.o_proj.weight": "model-00008-of-00014.safetensors",
    "model.layers.38.self_attn.q_norm.weight": "model-00008-of-00014.safetensors",
    "model.layers.38.self_attn.q_proj.weight": "model-00008-of-00014.safetensors",
    "model.layers.38.self_attn.v_proj.weight": "model-00008-of-00014.safetensors",
    "model.layers.39.input_layernorm.weight": "model-00009-of-00014.safetensors",
    "model.layers.39.mlp.down_proj.weight": "model-00009-of-00014.safetensors",
    "model.layers.39.mlp.gate_proj.weight": "model-00009-of-00014.safetensors",
    "model.layers.39.mlp.up_proj.weight": "model-00009-of-00014.safetensors",
    "model.layers.39.post_attention_layernorm.weight": "model-00009-of-00014.safetensors",
    "model.layers.39.self_attn.k_norm.weight": "model-00009-of-00014.safetensors",
    "model.layers.39.self_attn.k_proj.weight": "model-00009-of-00014.safetensors",
    "model.layers.39.self_attn.o_proj.weight": "model-00009-of-00014.safetensors",
    "model.layers.39.self_attn.q_norm.weight": "model-00009-of-00014.safetensors",
    "model.layers.39.self_attn.q_proj.weight": "model-00009-of-00014.safetensors",
    "model.layers.39.self_attn.v_proj.weight": "model-00009-of-00014.safetensors",
    "model.layers.4.input_layernorm.weight": "model-00002-of-00014.safetensors",
    "model.layers.4.mlp.down_proj.weight": "model-00002-of-00014.safetensors",
    "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00014.safetensors",
    "model.layers.4.mlp.up_proj.weight": "model-00002-of-00014.safetensors",
    "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00014.safetensors",
    "model.layers.4.self_attn.k_norm.weight": "model-00002-of-00014.safetensors",
    "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00014.safetensors",
    "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00014.safetensors",
    "model.layers.4.self_attn.q_norm.weight": "model-00002-of-00014.safetensors",
    "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00014.safetensors",
    "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00014.safetensors",
    "model.layers.40.input_layernorm.weight": "model-00009-of-00014.safetensors",
    "model.layers.40.mlp.down_proj.weight": "model-00009-of-00014.safetensors",
    "model.layers.40.mlp.gate_proj.weight": "model-00009-of-00014.safetensors",
    "model.layers.40.mlp.up_proj.weight": "model-00009-of-00014.safetensors",
    "model.layers.40.post_attention_layernorm.weight": "model-00009-of-00014.safetensors",
    "model.layers.40.self_attn.k_norm.weight": "model-00009-of-00014.safetensors",
    "model.layers.40.self_attn.k_proj.weight": "model-00009-of-00014.safetensors",
    "model.layers.40.self_attn.o_proj.weight": "model-00009-of-00014.safetensors",
    "model.layers.40.self_attn.q_norm.weight": "model-00009-of-00014.safetensors",
    "model.layers.40.self_attn.q_proj.weight": "model-00009-of-00014.safetensors",
    "model.layers.40.self_attn.v_proj.weight": "model-00009-of-00014.safetensors",
    "model.layers.41.input_layernorm.weight": "model-00009-of-00014.safetensors",
    "model.layers.41.mlp.down_proj.weight": "model-00009-of-00014.safetensors",
    "model.layers.41.mlp.gate_proj.weight": "model-00009-of-00014.safetensors",
    "model.layers.41.mlp.up_proj.weight": "model-00009-of-00014.safetensors",
    "model.layers.41.post_attention_layernorm.weight": "model-00009-of-00014.safetensors",
    "model.layers.41.self_attn.k_norm.weight": "model-00009-of-00014.safetensors",
    "model.layers.41.self_attn.k_proj.weight": "model-00009-of-00014.safetensors",
    "model.layers.41.self_attn.o_proj.weight": "model-00009-of-00014.safetensors",
    "model.layers.41.self_attn.q_norm.weight": "model-00009-of-00014.safetensors",
    "model.layers.41.self_attn.q_proj.weight": "model-00009-of-00014.safetensors",
    "model.layers.41.self_attn.v_proj.weight": "model-00009-of-00014.safetensors",
    "model.layers.42.input_layernorm.weight": "model-00009-of-00014.safetensors",
    "model.layers.42.mlp.down_proj.weight": "model-00009-of-00014.safetensors",
    "model.layers.42.mlp.gate_proj.weight": "model-00009-of-00014.safetensors",
    "model.layers.42.mlp.up_proj.weight": "model-00009-of-00014.safetensors",
    "model.layers.42.post_attention_layernorm.weight": "model-00009-of-00014.safetensors",
    "model.layers.42.self_attn.k_norm.weight": "model-00009-of-00014.safetensors",
    "model.layers.42.self_attn.k_proj.weight": "model-00009-of-00014.safetensors",
    "model.layers.42.self_attn.o_proj.weight": "model-00009-of-00014.safetensors",
    "model.layers.42.self_attn.q_norm.weight": "model-00009-of-00014.safetensors",
    "model.layers.42.self_attn.q_proj.weight": "model-00009-of-00014.safetensors",
    "model.layers.42.self_attn.v_proj.weight": "model-00009-of-00014.safetensors",
    "model.layers.43.input_layernorm.weight": "model-00010-of-00014.safetensors",
    "model.layers.43.mlp.down_proj.weight": "model-00010-of-00014.safetensors",
    "model.layers.43.mlp.gate_proj.weight": "model-00009-of-00014.safetensors",
    "model.layers.43.mlp.up_proj.weight": "model-00010-of-00014.safetensors",
    "model.layers.43.post_attention_layernorm.weight": "model-00010-of-00014.safetensors",
    "model.layers.43.self_attn.k_norm.weight": "model-00009-of-00014.safetensors",
    "model.layers.43.self_attn.k_proj.weight": "model-00009-of-00014.safetensors",
    "model.layers.43.self_attn.o_proj.weight": "model-00009-of-00014.safetensors",
    "model.layers.43.self_attn.q_norm.weight": "model-00009-of-00014.safetensors",
    "model.layers.43.self_attn.q_proj.weight": "model-00009-of-00014.safetensors",
    "model.layers.43.self_attn.v_proj.weight": "model-00009-of-00014.safetensors",
    "model.layers.44.input_layernorm.weight": "model-00010-of-00014.safetensors",
    "model.layers.44.mlp.down_proj.weight": "model-00010-of-00014.safetensors",
    "model.layers.44.mlp.gate_proj.weight": "model-00010-of-00014.safetensors",
    "model.layers.44.mlp.up_proj.weight": "model-00010-of-00014.safetensors",
    "model.layers.44.post_attention_layernorm.weight": "model-00010-of-00014.safetensors",
    "model.layers.44.self_attn.k_norm.weight": "model-00010-of-00014.safetensors",
    "model.layers.44.self_attn.k_proj.weight": "model-00010-of-00014.safetensors",
    "model.layers.44.self_attn.o_proj.weight": "model-00010-of-00014.safetensors",
    "model.layers.44.self_attn.q_norm.weight": "model-00010-of-00014.safetensors",
    "model.layers.44.self_attn.q_proj.weight": "model-00010-of-00014.safetensors",
    "model.layers.44.self_attn.v_proj.weight": "model-00010-of-00014.safetensors",
    "model.layers.45.input_layernorm.weight": "model-00010-of-00014.safetensors",
    "model.layers.45.mlp.down_proj.weight": "model-00010-of-00014.safetensors",
    "model.layers.45.mlp.gate_proj.weight": "model-00010-of-00014.safetensors",
    "model.layers.45.mlp.up_proj.weight": "model-00010-of-00014.safetensors",
    "model.layers.45.post_attention_layernorm.weight": "model-00010-of-00014.safetensors",
    "model.layers.45.self_attn.k_norm.weight": "model-00010-of-00014.safetensors",
    "model.layers.45.self_attn.k_proj.weight": "model-00010-of-00014.safetensors",
    "model.layers.45.self_attn.o_proj.weight": "model-00010-of-00014.safetensors",
    "model.layers.45.self_attn.q_norm.weight": "model-00010-of-00014.safetensors",
    "model.layers.45.self_attn.q_proj.weight": "model-00010-of-00014.safetensors",
    "model.layers.45.self_attn.v_proj.weight": "model-00010-of-00014.safetensors",
    "model.layers.46.input_layernorm.weight": "model-00010-of-00014.safetensors",
    "model.layers.46.mlp.down_proj.weight": "model-00010-of-00014.safetensors",
    "model.layers.46.mlp.gate_proj.weight": "model-00010-of-00014.safetensors",
    "model.layers.46.mlp.up_proj.weight": "model-00010-of-00014.safetensors",
    "model.layers.46.post_attention_layernorm.weight": "model-00010-of-00014.safetensors",
    "model.layers.46.self_attn.k_norm.weight": "model-00010-of-00014.safetensors",
    "model.layers.46.self_attn.k_proj.weight": "model-00010-of-00014.safetensors",
    "model.layers.46.self_attn.o_proj.weight": "model-00010-of-00014.safetensors",
    "model.layers.46.self_attn.q_norm.weight": "model-00010-of-00014.safetensors",
    "model.layers.46.self_attn.q_proj.weight": "model-00010-of-00014.safetensors",
    "model.layers.46.self_attn.v_proj.weight": "model-00010-of-00014.safetensors",
    "model.layers.47.input_layernorm.weight": "model-00010-of-00014.safetensors",
    "model.layers.47.mlp.down_proj.weight": "model-00010-of-00014.safetensors",
    "model.layers.47.mlp.gate_proj.weight": "model-00010-of-00014.safetensors",
    "model.layers.47.mlp.up_proj.weight": "model-00010-of-00014.safetensors",
    "model.layers.47.post_attention_layernorm.weight": "model-00010-of-00014.safetensors",
    "model.layers.47.self_attn.k_norm.weight": "model-00010-of-00014.safetensors",
    "model.layers.47.self_attn.k_proj.weight": "model-00010-of-00014.safetensors",
    "model.layers.47.self_attn.o_proj.weight": "model-00010-of-00014.safetensors",
    "model.layers.47.self_attn.q_norm.weight": "model-00010-of-00014.safetensors",
    "model.layers.47.self_attn.q_proj.weight": "model-00010-of-00014.safetensors",
    "model.layers.47.self_attn.v_proj.weight": "model-00010-of-00014.safetensors",
    "model.layers.48.input_layernorm.weight": "model-00011-of-00014.safetensors",
    "model.layers.48.mlp.down_proj.weight": "model-00011-of-00014.safetensors",
    "model.layers.48.mlp.gate_proj.weight": "model-00010-of-00014.safetensors",
    "model.layers.48.mlp.up_proj.weight": "model-00011-of-00014.safetensors",
    "model.layers.48.post_attention_layernorm.weight": "model-00011-of-00014.safetensors",
    "model.layers.48.self_attn.k_norm.weight": "model-00010-of-00014.safetensors",
    "model.layers.48.self_attn.k_proj.weight": "model-00010-of-00014.safetensors",
    "model.layers.48.self_attn.o_proj.weight": "model-00010-of-00014.safetensors",
    "model.layers.48.self_attn.q_norm.weight": "model-00010-of-00014.safetensors",
    "model.layers.48.self_attn.q_proj.weight": "model-00010-of-00014.safetensors",
    "model.layers.48.self_attn.v_proj.weight": "model-00010-of-00014.safetensors",
    "model.layers.49.input_layernorm.weight": "model-00011-of-00014.safetensors",
    "model.layers.49.mlp.down_proj.weight": "model-00011-of-00014.safetensors",
    "model.layers.49.mlp.gate_proj.weight": "model-00011-of-00014.safetensors",
    "model.layers.49.mlp.up_proj.weight": "model-00011-of-00014.safetensors",
    "model.layers.49.post_attention_layernorm.weight": "model-00011-of-00014.safetensors",
    "model.layers.49.self_attn.k_norm.weight": "model-00011-of-00014.safetensors",
    "model.layers.49.self_attn.k_proj.weight": "model-00011-of-00014.safetensors",
    "model.layers.49.self_attn.o_proj.weight": "model-00011-of-00014.safetensors",
    "model.layers.49.self_attn.q_norm.weight": "model-00011-of-00014.safetensors",
    "model.layers.49.self_attn.q_proj.weight": "model-00011-of-00014.safetensors",
    "model.layers.49.self_attn.v_proj.weight": "model-00011-of-00014.safetensors",
    "model.layers.5.input_layernorm.weight": "model-00002-of-00014.safetensors",
    "model.layers.5.mlp.down_proj.weight": "model-00002-of-00014.safetensors",
    "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00014.safetensors",
    "model.layers.5.mlp.up_proj.weight": "model-00002-of-00014.safetensors",
    "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00014.safetensors",
    "model.layers.5.self_attn.k_norm.weight": "model-00002-of-00014.safetensors",
    "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00014.safetensors",
    "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00014.safetensors",
    "model.layers.5.self_attn.q_norm.weight": "model-00002-of-00014.safetensors",
    "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00014.safetensors",
    "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00014.safetensors",
    "model.layers.50.input_layernorm.weight": "model-00011-of-00014.safetensors",
    "model.layers.50.mlp.down_proj.weight": "model-00011-of-00014.safetensors",
    "model.layers.50.mlp.gate_proj.weight": "model-00011-of-00014.safetensors",
    "model.layers.50.mlp.up_proj.weight": "model-00011-of-00014.safetensors",
    "model.layers.50.post_attention_layernorm.weight": "model-00011-of-00014.safetensors",
    "model.layers.50.self_attn.k_norm.weight": "model-00011-of-00014.safetensors",
    "model.layers.50.self_attn.k_proj.weight": "model-00011-of-00014.safetensors",
    "model.layers.50.self_attn.o_proj.weight": "model-00011-of-00014.safetensors",
    "model.layers.50.self_attn.q_norm.weight": "model-00011-of-00014.safetensors",
    "model.layers.50.self_attn.q_proj.weight": "model-00011-of-00014.safetensors",
    "model.layers.50.self_attn.v_proj.weight": "model-00011-of-00014.safetensors",
    "model.layers.51.input_layernorm.weight": "model-00011-of-00014.safetensors",
    "model.layers.51.mlp.down_proj.weight": "model-00011-of-00014.safetensors",
    "model.layers.51.mlp.gate_proj.weight": "model-00011-of-00014.safetensors",
    "model.layers.51.mlp.up_proj.weight": "model-00011-of-00014.safetensors",
    "model.layers.51.post_attention_layernorm.weight": "model-00011-of-00014.safetensors",
    "model.layers.51.self_attn.k_norm.weight": "model-00011-of-00014.safetensors",
    "model.layers.51.self_attn.k_proj.weight": "model-00011-of-00014.safetensors",
    "model.layers.51.self_attn.o_proj.weight": "model-00011-of-00014.safetensors",
    "model.layers.51.self_attn.q_norm.weight": "model-00011-of-00014.safetensors",
    "model.layers.51.self_attn.q_proj.weight": "model-00011-of-00014.safetensors",
    "model.layers.51.self_attn.v_proj.weight": "model-00011-of-00014.safetensors",
    "model.layers.52.input_layernorm.weight": "model-00011-of-00014.safetensors",
    "model.layers.52.mlp.down_proj.weight": "model-00011-of-00014.safetensors",
    "model.layers.52.mlp.gate_proj.weight": "model-00011-of-00014.safetensors",
    "model.layers.52.mlp.up_proj.weight": "model-00011-of-00014.safetensors",
    "model.layers.52.post_attention_layernorm.weight": "model-00011-of-00014.safetensors",
    "model.layers.52.self_attn.k_norm.weight": "model-00011-of-00014.safetensors",
    "model.layers.52.self_attn.k_proj.weight": "model-00011-of-00014.safetensors",
    "model.layers.52.self_attn.o_proj.weight": "model-00011-of-00014.safetensors",
    "model.layers.52.self_attn.q_norm.weight": "model-00011-of-00014.safetensors",
    "model.layers.52.self_attn.q_proj.weight": "model-00011-of-00014.safetensors",
    "model.layers.52.self_attn.v_proj.weight": "model-00011-of-00014.safetensors",
    "model.layers.53.input_layernorm.weight": "model-00012-of-00014.safetensors",
    "model.layers.53.mlp.down_proj.weight": "model-00012-of-00014.safetensors",
    "model.layers.53.mlp.gate_proj.weight": "model-00011-of-00014.safetensors",
    "model.layers.53.mlp.up_proj.weight": "model-00012-of-00014.safetensors",
    "model.layers.53.post_attention_layernorm.weight": "model-00012-of-00014.safetensors",
    "model.layers.53.self_attn.k_norm.weight": "model-00011-of-00014.safetensors",
    "model.layers.53.self_attn.k_proj.weight": "model-00011-of-00014.safetensors",
    "model.layers.53.self_attn.o_proj.weight": "model-00011-of-00014.safetensors",
    "model.layers.53.self_attn.q_norm.weight": "model-00011-of-00014.safetensors",
    "model.layers.53.self_attn.q_proj.weight": "model-00011-of-00014.safetensors",
    "model.layers.53.self_attn.v_proj.weight": "model-00011-of-00014.safetensors",
    "model.layers.54.input_layernorm.weight": "model-00012-of-00014.safetensors",
    "model.layers.54.mlp.down_proj.weight": "model-00012-of-00014.safetensors",
    "model.layers.54.mlp.gate_proj.weight": "model-00012-of-00014.safetensors",
    "model.layers.54.mlp.up_proj.weight": "model-00012-of-00014.safetensors",
    "model.layers.54.post_attention_layernorm.weight": "model-00012-of-00014.safetensors",
    "model.layers.54.self_attn.k_norm.weight": "model-00012-of-00014.safetensors",
    "model.layers.54.self_attn.k_proj.weight": "model-00012-of-00014.safetensors",
    "model.layers.54.self_attn.o_proj.weight": "model-00012-of-00014.safetensors",
    "model.layers.54.self_attn.q_norm.weight": "model-00012-of-00014.safetensors",
    "model.layers.54.self_attn.q_proj.weight": "model-00012-of-00014.safetensors",
    "model.layers.54.self_attn.v_proj.weight": "model-00012-of-00014.safetensors",
    "model.layers.55.input_layernorm.weight": "model-00012-of-00014.safetensors",
    "model.layers.55.mlp.down_proj.weight": "model-00012-of-00014.safetensors",
    "model.layers.55.mlp.gate_proj.weight": "model-00012-of-00014.safetensors",
    "model.layers.55.mlp.up_proj.weight": "model-00012-of-00014.safetensors",
    "model.layers.55.post_attention_layernorm.weight": "model-00012-of-00014.safetensors",
    "model.layers.55.self_attn.k_norm.weight": "model-00012-of-00014.safetensors",
    "model.layers.55.self_attn.k_proj.weight": "model-00012-of-00014.safetensors",
    "model.layers.55.self_attn.o_proj.weight": "model-00012-of-00014.safetensors",
    "model.layers.55.self_attn.q_norm.weight": "model-00012-of-00014.safetensors",
    "model.layers.55.self_attn.q_proj.weight": "model-00012-of-00014.safetensors",
    "model.layers.55.self_attn.v_proj.weight": "model-00012-of-00014.safetensors",
    "model.layers.56.input_layernorm.weight": "model-00012-of-00014.safetensors",
    "model.layers.56.mlp.down_proj.weight": "model-00012-of-00014.safetensors",
    "model.layers.56.mlp.gate_proj.weight": "model-00012-of-00014.safetensors",
    "model.layers.56.mlp.up_proj.weight": "model-00012-of-00014.safetensors",
    "model.layers.56.post_attention_layernorm.weight": "model-00012-of-00014.safetensors",
    "model.layers.56.self_attn.k_norm.weight": "model-00012-of-00014.safetensors",
    "model.layers.56.self_attn.k_proj.weight": "model-00012-of-00014.safetensors",
    "model.layers.56.self_attn.o_proj.weight": "model-00012-of-00014.safetensors",
    "model.layers.56.self_attn.q_norm.weight": "model-00012-of-00014.safetensors",
    "model.layers.56.self_attn.q_proj.weight": "model-00012-of-00014.safetensors",
    "model.layers.56.self_attn.v_proj.weight": "model-00012-of-00014.safetensors",
    "model.layers.57.input_layernorm.weight": "model-00012-of-00014.safetensors",
    "model.layers.57.mlp.down_proj.weight": "model-00012-of-00014.safetensors",
    "model.layers.57.mlp.gate_proj.weight": "model-00012-of-00014.safetensors",
    "model.layers.57.mlp.up_proj.weight": "model-00012-of-00014.safetensors",
    "model.layers.57.post_attention_layernorm.weight": "model-00012-of-00014.safetensors",
    "model.layers.57.self_attn.k_norm.weight": "model-00012-of-00014.safetensors",
    "model.layers.57.self_attn.k_proj.weight": "model-00012-of-00014.safetensors",
    "model.layers.57.self_attn.o_proj.weight": "model-00012-of-00014.safetensors",
    "model.layers.57.self_attn.q_norm.weight": "model-00012-of-00014.safetensors",
    "model.layers.57.self_attn.q_proj.weight": "model-00012-of-00014.safetensors",
    "model.layers.57.self_attn.v_proj.weight": "model-00012-of-00014.safetensors",
    "model.layers.58.input_layernorm.weight": "model-00013-of-00014.safetensors",
    "model.layers.58.mlp.down_proj.weight": "model-00013-of-00014.safetensors",
    "model.layers.58.mlp.gate_proj.weight": "model-00012-of-00014.safetensors",
    "model.layers.58.mlp.up_proj.weight": "model-00013-of-00014.safetensors",
    "model.layers.58.post_attention_layernorm.weight": "model-00013-of-00014.safetensors",
    "model.layers.58.self_attn.k_norm.weight": "model-00012-of-00014.safetensors",
    "model.layers.58.self_attn.k_proj.weight": "model-00012-of-00014.safetensors",
    "model.layers.58.self_attn.o_proj.weight": "model-00012-of-00014.safetensors",
    "model.layers.58.self_attn.q_norm.weight": "model-00012-of-00014.safetensors",
    "model.layers.58.self_attn.q_proj.weight": "model-00012-of-00014.safetensors",
    "model.layers.58.self_attn.v_proj.weight": "model-00012-of-00014.safetensors",
    "model.layers.59.input_layernorm.weight": "model-00013-of-00014.safetensors",
    "model.layers.59.mlp.down_proj.weight": "model-00013-of-00014.safetensors",
    "model.layers.59.mlp.gate_proj.weight": "model-00013-of-00014.safetensors",
    "model.layers.59.mlp.up_proj.weight": "model-00013-of-00014.safetensors",
    "model.layers.59.post_attention_layernorm.weight": "model-00013-of-00014.safetensors",
    "model.layers.59.self_attn.k_norm.weight": "model-00013-of-00014.safetensors",
    "model.layers.59.self_attn.k_proj.weight": "model-00013-of-00014.safetensors",
    "model.layers.59.self_attn.o_proj.weight": "model-00013-of-00014.safetensors",
    "model.layers.59.self_attn.q_norm.weight": "model-00013-of-00014.safetensors",
    "model.layers.59.self_attn.q_proj.weight": "model-00013-of-00014.safetensors",
    "model.layers.59.self_attn.v_proj.weight": "model-00013-of-00014.safetensors",
    "model.layers.6.input_layernorm.weight": "model-00002-of-00014.safetensors",
    "model.layers.6.mlp.down_proj.weight": "model-00002-of-00014.safetensors",
    "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00014.safetensors",
    "model.layers.6.mlp.up_proj.weight": "model-00002-of-00014.safetensors",
    "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00014.safetensors",
    "model.layers.6.self_attn.k_norm.weight": "model-00002-of-00014.safetensors",
    "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00014.safetensors",
    "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00014.safetensors",
    "model.layers.6.self_attn.q_norm.weight": "model-00002-of-00014.safetensors",
    "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00014.safetensors",
    "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00014.safetensors",
    "model.layers.60.input_layernorm.weight": "model-00013-of-00014.safetensors",
    "model.layers.60.mlp.down_proj.weight": "model-00013-of-00014.safetensors",
    "model.layers.60.mlp.gate_proj.weight": "model-00013-of-00014.safetensors",
    "model.layers.60.mlp.up_proj.weight": "model-00013-of-00014.safetensors",
    "model.layers.60.post_attention_layernorm.weight": "model-00013-of-00014.safetensors",
    "model.layers.60.self_attn.k_norm.weight": "model-00013-of-00014.safetensors",
    "model.layers.60.self_attn.k_proj.weight": "model-00013-of-00014.safetensors",
    "model.layers.60.self_attn.o_proj.weight": "model-00013-of-00014.safetensors",
    "model.layers.60.self_attn.q_norm.weight": "model-00013-of-00014.safetensors",
    "model.layers.60.self_attn.q_proj.weight": "model-00013-of-00014.safetensors",
    "model.layers.60.self_attn.v_proj.weight": "model-00013-of-00014.safetensors",
    "model.layers.61.input_layernorm.weight": "model-00013-of-00014.safetensors",
    "model.layers.61.mlp.down_proj.weight": "model-00013-of-00014.safetensors",
    "model.layers.61.mlp.gate_proj.weight": "model-00013-of-00014.safetensors",
    "model.layers.61.mlp.up_proj.weight": "model-00013-of-00014.safetensors",
    "model.layers.61.post_attention_layernorm.weight": "model-00013-of-00014.safetensors",
    "model.layers.61.self_attn.k_norm.weight": "model-00013-of-00014.safetensors",
    "model.layers.61.self_attn.k_proj.weight": "model-00013-of-00014.safetensors",
    "model.layers.61.self_attn.o_proj.weight": "model-00013-of-00014.safetensors",
    "model.layers.61.self_attn.q_norm.weight": "model-00013-of-00014.safetensors",
    "model.layers.61.self_attn.q_proj.weight": "model-00013-of-00014.safetensors",
    "model.layers.61.self_attn.v_proj.weight": "model-00013-of-00014.safetensors",
    "model.layers.62.input_layernorm.weight": "model-00013-of-00014.safetensors",
    "model.layers.62.mlp.down_proj.weight": "model-00013-of-00014.safetensors",
    "model.layers.62.mlp.gate_proj.weight": "model-00013-of-00014.safetensors",
    "model.layers.62.mlp.up_proj.weight": "model-00013-of-00014.safetensors",
    "model.layers.62.post_attention_layernorm.weight": "model-00013-of-00014.safetensors",
    "model.layers.62.self_attn.k_norm.weight": "model-00013-of-00014.safetensors",
    "model.layers.62.self_attn.k_proj.weight": "model-00013-of-00014.safetensors",
    "model.layers.62.self_attn.o_proj.weight": "model-00013-of-00014.safetensors",
    "model.layers.62.self_attn.q_norm.weight": "model-00013-of-00014.safetensors",
    "model.layers.62.self_attn.q_proj.weight": "model-00013-of-00014.safetensors",
    "model.layers.62.self_attn.v_proj.weight": "model-00013-of-00014.safetensors",
    "model.layers.63.input_layernorm.weight": "model-00014-of-00014.safetensors",
    "model.layers.63.mlp.down_proj.weight": "model-00014-of-00014.safetensors",
    "model.layers.63.mlp.gate_proj.weight": "model-00013-of-00014.safetensors",
    "model.layers.63.mlp.up_proj.weight": "model-00014-of-00014.safetensors",
    "model.layers.63.post_attention_layernorm.weight": "model-00014-of-00014.safetensors",
    "model.layers.63.self_attn.k_norm.weight": "model-00013-of-00014.safetensors",
    "model.layers.63.self_attn.k_proj.weight": "model-00013-of-00014.safetensors",
    "model.layers.63.self_attn.o_proj.weight": "model-00013-of-00014.safetensors",
    "model.layers.63.self_attn.q_norm.weight": "model-00013-of-00014.safetensors",
    "model.layers.63.self_attn.q_proj.weight": "model-00013-of-00014.safetensors",
    "model.layers.63.self_attn.v_proj.weight": "model-00013-of-00014.safetensors",
    "model.layers.7.input_layernorm.weight": "model-00002-of-00014.safetensors",
    "model.layers.7.mlp.down_proj.weight": "model-00002-of-00014.safetensors",
    "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00014.safetensors",
    "model.layers.7.mlp.up_proj.weight": "model-00002-of-00014.safetensors",
    "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00014.safetensors",
    "model.layers.7.self_attn.k_norm.weight": "model-00002-of-00014.safetensors",
    "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00014.safetensors",
    "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00014.safetensors",
    "model.layers.7.self_attn.q_norm.weight": "model-00002-of-00014.safetensors",
    "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00014.safetensors",
    "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00014.safetensors",
    "model.layers.8.input_layernorm.weight": "model-00003-of-00014.safetensors",
    "model.layers.8.mlp.down_proj.weight": "model-00003-of-00014.safetensors",
    "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00014.safetensors",
    "model.layers.8.mlp.up_proj.weight": "model-00003-of-00014.safetensors",
    "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00014.safetensors",
    "model.layers.8.self_attn.k_norm.weight": "model-00002-of-00014.safetensors",
    "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00014.safetensors",
    "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00014.safetensors",
    "model.layers.8.self_attn.q_norm.weight": "model-00002-of-00014.safetensors",
    "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00014.safetensors",
    "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00014.safetensors",
    "model.layers.9.input_layernorm.weight": "model-00003-of-00014.safetensors",
    "model.layers.9.mlp.down_proj.weight": "model-00003-of-00014.safetensors",
    "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00014.safetensors",
    "model.layers.9.mlp.up_proj.weight": "model-00003-of-00014.safetensors",
    "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00014.safetensors",
    "model.layers.9.self_attn.k_norm.weight": "model-00003-of-00014.safetensors",
    "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00014.safetensors",
    "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00014.safetensors",
    "model.layers.9.self_attn.q_norm.weight": "model-00003-of-00014.safetensors",
    "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00014.safetensors",
    "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00014.safetensors",
    "model.norm.weight": "model-00014-of-00014.safetensors"
  }
 }
--- a/run_summary.json
+++ b/run_summary.json
@@ -0,0 +1,12 @@
 {
  "agent_name": "9828cc7d5cb31c19ed7e6dead76bd24dc2d66262_thinking_preprocessed",
  "training_start": null,
  "training_end": null,
  "created_by": "raoof1",
  "base_model_name": "Qwen/Qwen3-32B",
  "dataset_name": "/e/scratch/jureap59/raoof1/sft_data/hf_hub/datasets--DCAgent--g1_min_episodes_e1_gpt_long_top8_glm47_traces/snapshots/9828cc7d5cb31c19ed7e6dead76bd24dc2d66262_thinking_preprocessed",
  "training_type": "SFT",
  "training_parameters": "https://huggingface.co/DCAgent/g1_gptlong_top8_32b/blob/main/config.json",
  "wandb_link": null,
  "traces_location_s3": null
 }
--- a/special_tokens_map.json
+++ b/special_tokens_map.json
@@ -0,0 +1,31 @@
 {
  "additional_special_tokens": [
    "<|im_start|>",
    "<|im_end|>",
    "<|object_ref_start|>",
    "<|object_ref_end|>",
    "<|box_start|>",
    "<|box_end|>",
    "<|quad_start|>",
    "<|quad_end|>",
    "<|vision_start|>",
    "<|vision_end|>",
    "<|vision_pad|>",
    "<|image_pad|>",
    "<|video_pad|>"
  ],
  "eos_token": {
    "content": "<|im_end|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
 }
--- a/tokenizer.json
+++ b/tokenizer.json
--- a/tokenizer_config.json
+++ b/tokenizer_config.json
@@ -0,0 +1,240 @@
 {
  "add_bos_token": false,
  "add_prefix_space": false,
  "added_tokens_decoder": {
    "151643": {
      "content": "<|endoftext|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151644": {
      "content": "<|im_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151645": {
      "content": "<|im_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151646": {
      "content": "<|object_ref_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151647": {
      "content": "<|object_ref_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151648": {
      "content": "<|box_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151649": {
      "content": "<|box_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151650": {
      "content": "<|quad_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151651": {
      "content": "<|quad_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151652": {
      "content": "<|vision_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151653": {
      "content": "<|vision_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151654": {
      "content": "<|vision_pad|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151655": {
      "content": "<|image_pad|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151656": {
      "content": "<|video_pad|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151657": {
      "content": "<tool_call>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151658": {
      "content": "</tool_call>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151659": {
      "content": "<|fim_prefix|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151660": {
      "content": "<|fim_middle|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151661": {
      "content": "<|fim_suffix|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151662": {
      "content": "<|fim_pad|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151663": {
      "content": "<|repo_name|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151664": {
      "content": "<|file_sep|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151665": {
      "content": "<tool_response>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151666": {
      "content": "</tool_response>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151667": {
      "content": "<think>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151668": {
      "content": "</think>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    }
  },
  "additional_special_tokens": [
    "<|im_start|>",
    "<|im_end|>",
    "<|object_ref_start|>",
    "<|object_ref_end|>",
    "<|box_start|>",
    "<|box_end|>",
    "<|quad_start|>",
    "<|quad_end|>",
    "<|vision_start|>",
    "<|vision_end|>",
    "<|vision_pad|>",
    "<|image_pad|>",
    "<|video_pad|>"
  ],
  "bos_token": null,
  "clean_up_tokenization_spaces": false,
  "eos_token": "<|im_end|>",
  "errors": "replace",
  "extra_special_tokens": {},
  "model_max_length": 32768,
  "pad_token": "<|endoftext|>",
  "padding_side": "right",
  "split_special_tokens": false,
  "tokenizer_class": "Qwen2Tokenizer",
  "unk_token": null
 }
--- a/train_results.json
+++ b/train_results.json
@@ -0,0 +1,16 @@
 {
    "achieved_tflops_per_gpu": 0.010354545414379324,
    "achieved_tflops_per_gpu_theoretical": 1519.821207581491,
    "epoch": 5.0,
    "loss_nan_ranks": 0,
    "loss_rank_avg": 0.21731798350811005,
    "mfu_percent": 0.0007317699939490688,
    "mfu_percent_theoretical": 107.40785919303823,
    "total_flos": 9850857283649536.0,
    "train_loss": 0.03974963869139502,
    "train_runtime": 9909.9567,
    "train_samples_per_second": 14.475,
    "train_steps_per_second": 0.151,
    "valid_targets_mean": 1465.8,
    "valid_targets_min": 325
 }
--- a/trainer_state.json
+++ b/trainer_state.json
--- a/training_configs/sft.sbatch
+++ b/training_configs/sft.sbatch
@@ -0,0 +1,366 @@
 #!/bin/bash
 #SBATCH --time=12:00:00
 #SBATCH --nodes=24
 #SBATCH --ntasks-per-node=1
 #SBATCH --cpus-per-task=288
 #SBATCH --output=/e/scratch/jureap59/raoof1/sft_experiments/g1_gptlong_top8_32b/logs/%x_%j.out
 #SBATCH --job-name=g1_gptlong_top8_32b__Qwen3-8B
 #SBATCH --mail-type=END,TIME_LIMIT,FAIL
 #SBATCH --mail-user=
 #SBATCH -p booster
 #SBATCH --account reformo
 #SBATCH --gres=gpu:4
 #SBATCH --exclude=jpbo-031-[01-48]
 # ==============================================================================
 # Universal SFT Training SBATCH Template
 # ==============================================================================
 # This template replaces the cluster-specific *_train.sbatch scripts by delegating
 # all logic to the SFTJobRunner Python class.
 #
 # Usage: The launcher writes a JSON config file and substitutes its path into
 # this template. For this job the config file is:
 #   /e/scratch/jureap59/raoof1/sft_experiments/g1_gptlong_top8_32b/configs/g1_gptlong_top8_32b__Qwen3-8B_sft_config.json
 # ==============================================================================
 # Make the `conda` shell function available in this non-interactive shell,
 # then activate the training environment.
 . /e/scratch/jureap59/feuer1/miniforge3/etc/profile.d/conda.sh
 conda activate otagent
 # From this point on, abort on the first failing command or pipeline stage.
 set -e -o pipefail
 ml purge
 # Disable core dumps to avoid filling disk space
 ulimit -c 0
 # Some bash completion scripts read BASH_COMPLETION_DEBUG; export it as an
 # empty string, but only when it is not set at all.
 if [ "${BASH_COMPLETION_DEBUG+x}" != "x" ]; then
     export BASH_COMPLETION_DEBUG=""
 fi
 # --- Clean up /tmp to prevent state pollution from previous jobs ---
 # Some HPC systems retain /tmp contents across job allocations on the same node.
 # This can cause issues with tmux sessions, container state, and other temporary files.
 # Removal is best-effort: errors are silenced and never abort the job.
 for _stale_path in /tmp/tmux-* /tmp/ray /tmp/hf_home /tmp/containers /tmp/podman-*; do
     rm -rf "$_stale_path" 2>/dev/null || true
 done
 unset _stale_path
 # Guard conda deactivate scripts from set -u complaints: make sure each
 # CONDA_BACKUP_* variable is exported, keeping its current value if it has one.
 for _compiler in CXX CC FC; do
     _backup_var="CONDA_BACKUP_${_compiler}"
     export "${_backup_var}=${!_backup_var:-}"
 done
 unset _compiler _backup_var
 # --- Module and Conda Setup ---
 # --- Module loading (cluster-specific, substituted by launcher) ---
 module load nvidia-compilers/25.9-CUDA-13
 # --- Environment setup ---
 # Working directory precedence: DCFT_PRIVATE, then DCFT, then the current
 # directory. Empty values are treated the same as unset ones.
 WORKDIR="${DCFT_PRIVATE:-${DCFT:-$PWD}}"
 cd "$WORKDIR"
 # Default DCFT to the working directory when it is unset or empty.
 [ -n "${DCFT:-}" ] || export DCFT="$WORKDIR"
 # --- Conda activation (cluster-specific, substituted by launcher) ---
 # No conda activation configured
 # --- Source environment files ---
 # Cluster dotenv file, if present in the checkout.
 if [[ -n "${DCFT:-}" && -f "$DCFT/hpc/dotenv/jupiter.env" ]]; then
     . "$DCFT/hpc/dotenv/jupiter.env"
 fi
 # Optional secrets file: allexport (set -a) is switched on while sourcing so
 # every variable the file assigns is exported to child processes.
 if [[ -n "${DC_AGENT_SECRET_ENV:-}" && -f "$DC_AGENT_SECRET_ENV" ]]; then
     set -a
     . "$DC_AGENT_SECRET_ENV"
     set +a
 fi
 # Optional caller-supplied activation snippet, evaluated in this shell.
 if [[ -n "${DCFT_ACTIVATE_ENV:-}" ]]; then
     eval "$DCFT_ACTIVATE_ENV"
 fi
 # --- CUDA path detection (Perlmutter and similar) ---
 # (nothing configured for this cluster)
 # --- NCCL/Networking settings (cluster-specific) ---
 export NCCL_DEBUG="INFO" \
     NCCL_NET_GDR_LEVEL="0" \
     NCCL_SOCKET_IFNAME="ib0" \
     NCCL_IB_TIMEOUT="60"
 # --- Cluster-specific environment variables ---
 export WANDB_MODE="offline" \
     GLOO_USE_IPV6="0" \
     NCCL_SOCKET_FAMILY="AF_INET" \
     SKYRL_ENABLE_NUMA_AFFINITY="1" \
     DISABLE_AIOHTTP_TRANSPORT="True" \
     VLLM_ALLREDUCE_USE_SYMM_MEM="0"
 # --- Ray defaults ---
 # Keep a caller-provided compiled-graph timeout, otherwise default to 900.
 export RAY_CGRAPH_get_timeout="${RAY_CGRAPH_get_timeout:-900}"
 # GH200 unified memory: GPU HBM is part of system RAM, so Ray's
 # memory monitor double-counts GPU allocations and kills workers
 # during model loading.  Disable the monitor entirely.
 export RAY_memory_monitor_refresh_ms=0
 # Unless the caller chose a Ray temp dir, use a per-job directory under
 # /tmp/ray (falling back to this shell's PID when there is no Slurm job id).
 if [[ -z "${RAY_TMPDIR:-}" ]]; then
     RAY_TMPDIR_BASE="/tmp/ray"
     RAY_TMPDIR="${RAY_TMPDIR_BASE}/ray_${SLURM_JOB_ID:-$$}"
     mkdir -p "$RAY_TMPDIR"
 fi
 export RAY_TMPDIR
 printf '[ray] RAY_TMPDIR=%s\n' "$RAY_TMPDIR"
 # --- Standard environment variables ---
 export PYTHONFAULTHANDLER=1 \
     TORCH_SHOW_CPP_STACKTRACES=1 \
     CUDA_LAUNCH_BLOCKING=0 \
     TORCH_NCCL_ASYNC_ERROR_HANDLING=1 \
     OMP_NUM_THREADS=1
 # Put the working directory first on the Python module search path.
 export PYTHONPATH="$WORKDIR:${PYTHONPATH:-}"
 # Skip LlamaFactory transformers version check
 export DISABLE_VERSION_CHECK=1
 # --- Distributed training setup ---
 # The first hostname in the job's node list acts as the rendezvous master.
 export MASTER_ADDR="$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)"
 # If the hostname contains jrc or jwb, append "i" to the master address so
 # the InfiniBand interface is used.
 case "$MASTER_ADDR" in
     *jrc*|*jwb*)
         export MASTER_ADDR="${MASTER_ADDR}i"
         ;;
 esac
 printf 'MASTER_ADDR set to %s\n' "$MASTER_ADDR"
 export MASTER_PORT=12802
 export NUM_NODES="$SLURM_JOB_NUM_NODES"
 export NUM_GPUS_PER_NODE=4
 # Total GPU count across the allocation.
 export NUM_GPUS=$(( NUM_GPUS_PER_NODE * SLURM_NNODES ))
 # --- HuggingFace/WandB paths ---
 # Keep a caller-provided HF_HOME; otherwise fall back to HF_HUB_CACHE, then
 # to a node-local directory.
 export HF_HOME="${HF_HOME:-${HF_HUB_CACHE:-/tmp/hf_home}}"
 # The experiments directory is an absolute path (the same one used by
 # `#SBATCH --output` above). It must NOT be prefixed with $DCFT: doing so
 # produced "$DCFT//e/scratch/...", i.e. a mirrored directory tree inside the
 # repository checkout instead of the real experiments directory.
 _SFT_EXPERIMENTS_DIR="/e/scratch/jureap59/raoof1/sft_experiments/g1_gptlong_top8_32b"
 # DCFT_WANDB_DIR, when set, overrides the default W&B directory.
 export WANDB_DIR="${DCFT_WANDB_DIR:-${_SFT_EXPERIMENTS_DIR}/wandb}"
 # --- Triton/TorchInductor cache settings (node-local to avoid shared FS issues) ---
 export TRITON_CACHE_VERBOSE=1
 source "$WORKDIR/hpc/shell_utils/triton_cache.sh"
 # --- Create experiment directories ---
 mkdir -p \
     "${_SFT_EXPERIMENTS_DIR}" \
     "${_SFT_EXPERIMENTS_DIR}/logs" \
     "${_SFT_EXPERIMENTS_DIR}/tmp"
 # --- Supabase environment variables for DB registration ---
 # Re-export each Supabase variable that already has a value so child
 # processes inherit it; warn on stderr (without failing) about missing ones.
 for _supabase_var in SUPABASE_URL SUPABASE_ANON_KEY SUPABASE_SERVICE_ROLE_KEY; do
     if [[ -n "${!_supabase_var:-}" ]]; then
         export "${_supabase_var}=${!_supabase_var}"
     else
         echo "Warning: ${_supabase_var} is not set; Supabase registration may fail." >&2
     fi
 done
 # --- SSH Tunneling (JSC clusters only) ---
 # ============================================================================
 # SSH Tunnel + Proxychains Setup for No-Internet Clusters (JSC)
 #
 # Creates SOCKS5 proxy via SSH tunnel to login node, then uses proxychains
 # to route external traffic through the tunnel.
 #
 # Jupiter (ARM GH200): Uses wrapped binary approach (proxychains4 -f <config> cmd)
 # Other JSC clusters: Uses LD_PRELOAD approach for Ray worker inheritance
 #
 # The setup is wrapped in a function because several paths bail out with
 # `return 0` ("skip proxy setup and carry on"). This script is executed, not
 # sourced, and a top-level `return` is an error with a non-zero status there;
 # under `set -e` that aborted the whole job instead of skipping the proxy.
 #
 # Variables are intentionally NOT declared `local`: NODE_HOST, LOGIN_NODE,
 # PROXYCHAINS_*, TUNNEL_PORT, NODE_IP and CFG_PATH stay visible to the rest of
 # the script exactly as when this code ran at top level.
 # ============================================================================
 setup_ssh_proxy() {
     # Determine login node and proxychains paths based on cluster
     NODE_HOST=$(hostname -s)
     PROXYCHAINS_MODE=""  # "binary" or "ldpreload"
     if [[ $NODE_HOST == jrc* ]]; then
         LOGIN_NODE="jrlogin05i"
         PROXYCHAINS_LIB="/p/scratch/synthlaion/dc-agent-shared/tools/proxychains-ng-install/lib/libproxychains4.so"
         PROXYCHAINS_MODE="ldpreload"
     elif [[ $NODE_HOST == jwb* ]]; then
         LOGIN_NODE="jwlogin22i"
         PROXYCHAINS_LIB="/p/scratch/synthlaion/dc-agent-shared/tools/proxychains-ng-install/lib/libproxychains4.so"
         PROXYCHAINS_MODE="ldpreload"
     elif [[ $NODE_HOST == jpb* ]] || [[ $NODE_HOST == jpc* ]]; then
         LOGIN_NODE="jpbl-s01-01"
         # Jupiter uses aarch64 build - binary wrapper approach (LD_PRELOAD doesn't work reliably)
         PROXYCHAINS_BIN="/e/scratch/jureap59/feuer1/proxychains-ng-aarch64/bin/proxychains4"
         PROXYCHAINS_MODE="binary"
     elif [[ $NODE_HOST == lrdn* ]] || [[ $NODE_HOST == *.leonardo.local ]]; then
         LOGIN_NODE="login05-ext.leonardo.cineca.it"
         # Leonardo uses x86 build - binary wrapper approach
         PROXYCHAINS_BIN="/leonardo/home/userexternal/bfeuer00/proxychains/bin/proxychains4"
         PROXYCHAINS_MODE="binary"
     else
         echo "[proxy] Unknown cluster for node $NODE_HOST - skipping proxy setup"
         return 0
     fi
     TUNNEL_PORT=7003
     # Check if proxychains is available
     if [[ "$PROXYCHAINS_MODE" == "binary" ]]; then
         if [ ! -x "$PROXYCHAINS_BIN" ]; then
             echo "[proxy] ✗ proxychains binary not found at $PROXYCHAINS_BIN"
             echo "[proxy] Skipping proxy setup - external connectivity will fail"
             return 0
         fi
         echo "[proxy] ✓ Found proxychains binary at $PROXYCHAINS_BIN"
     else
         if [ ! -f "$PROXYCHAINS_LIB" ]; then
             echo "[proxy] ✗ proxychains library not found at $PROXYCHAINS_LIB"
             echo "[proxy] Skipping proxy setup - external connectivity will fail"
             return 0
         fi
         echo "[proxy] ✓ Found proxychains library at $PROXYCHAINS_LIB"
     fi
     # Without an SSH key there is nothing to tunnel with; skip the rest.
     if [ -z "${SSH_KEY:-}" ]; then
         echo "[proxy] SSH_KEY not set - skipping proxy setup"
         echo "[proxy] Set SSH_KEY in your environment to enable internet access"
         return 0
     fi
     # Get this node's IP address for multi-node proxy access
     # (last "Address" line of the nslookup output).
     NODE_IP=$(nslookup "$NODE_HOST" | grep 'Address' | tail -n1 | awk '{print $2}')
     echo "[proxy] Setting up SSH tunnel to $LOGIN_NODE"
     echo "[proxy] SSH key: $SSH_KEY"
     echo "[proxy] Tunnel port: $TUNNEL_PORT"
     echo "[proxy] Node IP: $NODE_IP (workers will connect here)"
     # Create SSH tunnel with SOCKS5 proxy
     # -g flag allows remote hosts (worker nodes) to connect to the tunnel
     # -f/-N: go to background and run no remote command
     ssh -g -f -N -D "${TUNNEL_PORT}" \
         -o StrictHostKeyChecking=no \
         -o ConnectTimeout=1000 \
         -o ServerAliveInterval=10 \
         -o ServerAliveCountMax=30 \
         -o TCPKeepAlive=yes \
         -o ExitOnForwardFailure=yes \
         -o BatchMode=yes \
         -i "${SSH_KEY}" \
         "${USER}@${LOGIN_NODE}"
     # Give tunnel time to establish
     sleep 5
     # Verify tunnel is running
     if pgrep -f "ssh.*-D.*${TUNNEL_PORT}" > /dev/null; then
         echo "[proxy] ✓ SSH tunnel started successfully"
     else
         echo "[proxy] ✗ SSH tunnel failed to start"
         return 0
     fi
     # ============================================================================
     # Generate proxychains config
     # Key: Uses NODE_IP (not localhost) so worker nodes can access the tunnel
     # localnet entries ensure internal traffic (Ray, NCCL) bypasses proxy
     # ============================================================================
     SLURM_JOB_ID=${SLURM_JOB_ID:-"local"}
     CFG_PATH=~/.proxychains/proxychains_${SLURM_JOB_ID}.conf
     mkdir -p ~/.proxychains
     # Heredoc body and terminator are deliberately not indented.
     cat > "$CFG_PATH" <<PCEOF
 strict_chain
 quiet_mode
 tcp_read_time_out 30000
 tcp_connect_time_out 15000
 localnet 127.0.0.0/255.0.0.0
 localnet 127.0.0.1/255.255.255.255
 localnet 10.0.0.0/255.0.0.0
 localnet 172.16.0.0/255.240.0.0
 localnet 192.168.0.0/255.255.0.0
 localnet 169.254.0.0/255.255.0.0
 [ProxyList]
 socks5 ${NODE_IP} ${TUNNEL_PORT}
 PCEOF
     echo "[proxy] ✓ Generated proxychains config at $CFG_PATH"
     echo "[proxy]   - Internal traffic (10.x.x.x, 172.x.x.x, 169.254.x.x) → DIRECT"
     echo "[proxy]   - External traffic (internet) → PROXY via tunnel"
     # ============================================================================
     # Export proxychains configuration based on mode
     # ============================================================================
     export PROXYCHAINS_CONF_FILE="$CFG_PATH"
     export PROXYCHAINS_SOCKS5_HOST="${NODE_IP}"
     export PROXYCHAINS_SOCKS5_PORT="${TUNNEL_PORT}"
     # NOTE(review): the mode-specific exports below are disabled, so neither
     # PROXYCHAINS_BINARY nor LD_PRELOAD is set by this script even though the
     # later status messages mention them - confirm this is intentional.
     # if [[ "$PROXYCHAINS_MODE" == "binary" ]]; then
     #     # Binary wrapper approach (Jupiter ARM GH200)
     #     # Ray workers will use: proxychains4 -f $PROXYCHAINS_CONF_FILE ray start ...
     #     export PROXYCHAINS_BINARY="$PROXYCHAINS_BIN"
     #     echo "[proxy] ✓ PROXYCHAINS_BINARY=$PROXYCHAINS_BIN"
     #     echo "[proxy] ✓ PROXYCHAINS_CONF_FILE=$CFG_PATH"
     #     echo "[proxy] ✓ PROXYCHAINS_SOCKS5_HOST=${NODE_IP} (accessible from worker nodes)"
     #     echo "[proxy] ✓ PROXYCHAINS_SOCKS5_PORT=${TUNNEL_PORT}"
     # else
     #     # LD_PRELOAD approach (Jureca, Juwels)
     #     # Ray workers inherit proxy via LD_PRELOAD environment variable
     #     export LD_PRELOAD="$PROXYCHAINS_LIB"
     #     echo "[proxy] ✓ LD_PRELOAD set to $PROXYCHAINS_LIB"
     #     echo "[proxy] ✓ PROXYCHAINS_CONF_FILE=$CFG_PATH"
     #     echo "[proxy] ✓ PROXYCHAINS_SOCKS5_HOST=${NODE_IP} (accessible from worker nodes)"
     #     echo "[proxy] ✓ PROXYCHAINS_SOCKS5_PORT=${TUNNEL_PORT}"
     # fi
     # ============================================================================
     # Daytona/aiohttp timeout and retry settings
     # ============================================================================
     export DAYTONA_MAX_RETRIES=5
     export DAYTONA_RETRY_DELAY=30
     export DAYTONA_BACKOFF_FACTOR=2
     export DAYTONA_TIMEOUT=1800  # 30 minutes
     export AIOHTTP_CLIENT_TIMEOUT=900  # 15 minutes
     export AIOHTTP_CONNECTOR_TIMEOUT=900
     export AIOHTTP_SOCK_CONNECT_TIMEOUT=300
     export AIOHTTP_TOTAL_TIMEOUT=1800
     # Disable SSL verification (JSC certificate issues)
     export PYTHONHTTPSVERIFY=0
     unset SSL_CERT_FILE
     unset CURL_CA_BUNDLE
     unset REQUESTS_CA_BUNDLE
     unset SSL_CERT_DIR
     echo "[proxy] ✓ Daytona timeout settings configured"
     # Test proxy connectivity (failures only produce a warning)
     echo "[proxy] Testing proxy connectivity..."
     if [[ "$PROXYCHAINS_MODE" == "binary" ]]; then
         if "$PROXYCHAINS_BIN" -f "$CFG_PATH" curl -s --connect-timeout 10 https://huggingface.co -o /dev/null; then
             echo "[proxy] ✓ Proxy connectivity test passed (huggingface.co reachable via wrapped binary)"
         else
             echo "[proxy] ⚠ Proxy connectivity test failed (may still work for Daytona)"
         fi
     else
         if curl -s --connect-timeout 10 https://huggingface.co -o /dev/null 2>/dev/null; then
             echo "[proxy] ✓ Proxy connectivity test passed (huggingface.co reachable via LD_PRELOAD)"
         else
             echo "[proxy] ⚠ Proxy connectivity test failed (may still work for Daytona)"
         fi
     fi
     # Test that tunnel is accessible from this node's IP (for worker node access)
     if nc -z "${NODE_IP}" "${TUNNEL_PORT}" 2>/dev/null; then
         echo "[proxy] ✓ Tunnel accessible at ${NODE_IP}:${TUNNEL_PORT} (workers can connect)"
     else
         echo "[proxy] ⚠ Tunnel not accessible at ${NODE_IP}:${TUNNEL_PORT} (workers may fail)"
     fi
     if [[ "$PROXYCHAINS_MODE" == "binary" ]]; then
         echo "[proxy] ✓ Proxy setup complete (using wrapped binary for Ray workers)"
     else
         echo "[proxy] ✓ Proxy setup complete (using LD_PRELOAD for Ray worker inheritance)"
     fi
 }
 # Called as a plain command (not in an `if`/`||` context) so `set -e` stays
 # in effect inside the function, as it was for the original top-level code.
 setup_ssh_proxy
 # --- Run the SFT job via Python runner ---
 # JSON job config written by the launcher; passed to the Python runner below.
 _sft_config_json="/e/scratch/jureap59/raoof1/sft_experiments/g1_gptlong_top8_32b/configs/g1_gptlong_top8_32b__Qwen3-8B_sft_config.json"
 # Startup banner, one line per argument.
 printf '%s\n' \
     "=== Universal SFT Training Runner ===" \
     "Config: ${_sft_config_json}" \
     "Working directory: $WORKDIR" \
     "Nodes: $NUM_NODES, GPUs/node: $NUM_GPUS_PER_NODE" \
     "======================================"
 echo LD_LIBRARY_PATH=$LD_LIBRARY_PATH
 # Start the runner via srun across the 24 requested nodes. $PROXY_CMD is
 # left unquoted on purpose: when it is unset or empty it expands to nothing.
 srun --mpi=none --nodes=24 $PROXY_CMD bash -c "python -m hpc.sft_launch_utils --config \"${_sft_config_json}\""
--- a/training_configs/sft_config.json
+++ b/training_configs/sft_config.json
@@ -0,0 +1,15 @@
 {
  "job_name": "g1_gptlong_top8_32b__Qwen3-8B",
  "train_config_path": "/e/scratch/jureap59/raoof1/sft_experiments/g1_gptlong_top8_32b/configs/g1_gptlong_top8_32b__Qwen3-8B_train_config.yaml",
  "experiments_dir": "/e/scratch/jureap59/raoof1/sft_experiments/g1_gptlong_top8_32b",
  "cluster_name": "jupiter",
  "num_nodes": 24,
  "gpus_per_node": 4,
  "cpus_per_node": 288,
  "launcher": "accelerate",
  "accelerate_config_path": null,
  "deepspeed_config": "sft/lf_configs/deepspeed/ds_z3_accelerate.json",
  "master_port": 12802,
  "needs_ssh_tunnel": true,
  "needs_cuda_detection": false
 }
--- a/training_configs/train_config.yaml
+++ b/training_configs/train_config.yaml
@@ -0,0 +1,52 @@
 adam_beta2: 0.98
 assistant_tag: assistant
 attn: fa2
 bf16: true
 content_tag: content
 cutoff_len: 32768
 dataloader_num_workers: 4
 dataloader_persistent_workers: true
 dataloader_pin_memory: true
 dataset: /e/scratch/jureap59/raoof1/sft_data/hf_hub/datasets--DCAgent--g1_min_episodes_e1_gpt_long_top8_glm47_traces/snapshots/9828cc7d5cb31c19ed7e6dead76bd24dc2d66262_thinking_preprocessed
 dataset_dir: ONLINE
 datasets_cache_dir: /e/scratch/jureap59/raoof1/sft_data/arrow_cache
 ddp_timeout: 180000000  # NOTE(review): presumably seconds, i.e. effectively "never time out" — confirm unit
 deepspeed: sft/lf_configs/deepspeed/ds_z3_accelerate.json  # same relative path as "deepspeed_config" in training_configs/sft_config.json
 do_train: true
 enable_liger_kernel: true
 finetuning_type: full
 formatting: sharegpt
 gradient_accumulation_steps: 1
 gradient_checkpointing: true
 hub_model_id: DCAgent/g1_gptlong_top8_32b  # push_to_hub is false in this file, so this id is presumably unused by the run itself — verify
 include_mfu: true
 learning_rate: 4.0e-05
 load_best_model_at_end: false
 logging_steps: 5
 logging_strategy: steps
 lr_scheduler_type: cosine
 max_grad_norm: 0.001  # NOTE(review): far below the commonly used 1.0 clipping threshold — confirm this is intentional
 messages: conversations
 model_name_or_path: /e/scratch/jureap59/raoof1/sft_data/hf_hub/models--Qwen--Qwen3-32B/snapshots/9216db5781bf21249d130ec9da846c4624c16137
 num_train_epochs: 5.0
 optim: adamw_torch_fused
 output_dir: /e/scratch/jureap59/raoof1/sft_data/checkpoints/sft_g1_gptlong_top8_32b__Qwen3-32B
 overwrite_cache: true
 per_device_train_batch_size: 1  # with gradient_accumulation_steps 1 and 24 nodes x 4 GPUs (sft_config.json) this presumably yields a global batch of 96 — confirm
 plot_loss: true
 preprocessing_num_workers: 16
 pure_bf16: false
 push_to_hub: false
 role_tag: role
 run_name: g1_gptlong_top8_32b__Qwen3-8B  # NOTE(review): name says Qwen3-8B, but model_name_or_path and output_dir reference Qwen3-32B — confirm naming
 save_steps: 300
 save_strategy: steps
 save_total_limit: 1
 seed: 42
 stage: sft
 template: qwen3
 trust_remote_code: true
 user_tag: user
 warmup_ratio: 0.1
 weight_decay: 0.04
 disable_shuffling: true  # NOTE(review): the only key out of alphabetical order — looks appended after the rest was generated; verify it is still wanted
--- a/vocab.json
+++ b/vocab.json