commit 8f1663e427742edd683ebae1bbe538b21ba83ecb
Author: ModelHub XC <noreply@modelhub.org.cn>
Date:   Thu May 14 05:33:56 2026 +0800

    初始化项目，由ModelHub XC社区提供模型
    
    Model: FutureMa/Qwen3-8B-Drama-Thinking
    Source: Original Platform

diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..dafc673
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,56 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+ 
+ 
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zstandard filter=lfs diff=lfs merge=lfs -text
+*.tfevents* filter=lfs diff=lfs merge=lfs -text
+*.db* filter=lfs diff=lfs merge=lfs -text
+*.ark* filter=lfs diff=lfs merge=lfs -text
+**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text
+**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text
+**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text
+ 
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.gguf* filter=lfs diff=lfs merge=lfs -text
+*.ggml filter=lfs diff=lfs merge=lfs -text
+*.llamafile* filter=lfs diff=lfs merge=lfs -text
+*.pt2 filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+
+model-00002-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text
+model-00004-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text
+model-00003-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text
+training_args.bin filter=lfs diff=lfs merge=lfs -text
+vocab.json filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
+merges.txt filter=lfs diff=lfs merge=lfs -text
+model-00001-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..0be9a2b
--- /dev/null
+++ b/README.md
@@ -0,0 +1,354 @@
+---
+license: apache-2.0
+base_model: Qwen/Qwen3-8B
+tags:
+  - qwen3
+  - thinking
+  - creative-writing
+  - screenwriting
+  - drama
+  - chain-of-thought
+  - reasoning
+  - ms-swift
+  - full-parameter-finetuning
+datasets:
+  - custom-drama-thinking-dataset
+language:
+  - en
+  - zh
+library_name: transformers
+pipeline_tag: text-generation
+model-index:
+  - name: Qwen3-8B-Drama-Thinking
+    results:
+      - task:
+          type: text-generation
+          name: Creative Script Writing
+        metrics:
+          - type: thinking_depth
+            value: 9.0
+            name: Thinking Depth Score
+          - type: script_format
+            value: 9.0
+            name: Script Format Score
+          - type: dramatic_craft
+            value: 8.5
+            name: Dramatic Craft Score
+---
+
+# Qwen3-8B-Drama-Thinking
+
+This model is a **full parameter fine-tuned** version of [Qwen/Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B) on a custom drama thinking dataset with explicit creative reasoning chains.
+
+## Model Description
+
+- **Base Model**: Qwen3-8B (8 billion parameters)
+- **Training Method**: Full Parameter Fine-tuning (NOT LoRA)
+- **Training Framework**: [ms-swift](https://github.com/modelscope/ms-swift)
+- **Training Data**: Custom Drama Thinking Dataset (6,319 samples, avg ~5,000 tokens)
+- **Specialization**: Screenwriting with explicit `<think>...</think>` creative reasoning
+- **Hardware**: 2x NVIDIA H100 80GB SXM5
+- **Training Time**: 2 hours 46 minutes (3 epochs)
+- **Training Cost**: ~$17.86
+
+## Key Features
+
+### 🎬 Professional Screenwriting Assistant
+
+This model generates dramatic scripts with **explicit creative deliberation**:
+
+- ✅ **Thinking Process Visible**: Uses `<think>...</think>` tags to show internal reasoning
+- ✅ **Deep Character Psychology**: Analyzes motivations, defense mechanisms, subtext
+- ✅ **Structural Planning**: Three-act structure, emotional arcs, pacing decisions
+- ✅ **Visual Storytelling**: Symbolism, atmosphere, cinematographic choices
+- ✅ **Professional Format**: Correct screenplay formatting (scene headers, action lines, dialogue)
+
+### 📊 Performance Comparison
+
+Compared to base Qwen3-8B:
+
+| Metric | Base Model | Fine-Tuned | Improvement |
+|--------|------------|------------|-------------|
+| **Output Length** | 1,071 tokens | 3,874 tokens | **+262%** |
+| **Thinking Depth** | 5/10 | 9/10 | **+80%** |
+| **Creative Reasoning** | 500 tokens | 3,400 tokens | **+580%** |
+| **Craft Analysis** | Generic | Professional | **Qualitative leap** |
+
+### 🎯 Unique Value Proposition
+
+> This is not just a text generator - it's a **creative thinking partner** that externalizes
+> the entire screenwriting process: from title analysis to character psychology to structural
+> planning to final execution.
+
+## Training Details
+
+### Training Configuration
+
+```text
+Model:              Qwen/Qwen3-8B
+Template:           qwen3_thinking
+Training Type:      Full Parameter (all 8B parameters)
+Max Length:         8192 tokens (for long thinking chains)
+Batch Size:         1 per device × 2 GPUs
+Gradient Accum:     8 steps (effective batch size: 16)
+Learning Rate:      1e-5
+Epochs:             3
+Optimization:       DeepSpeed Zero3 + Gradient Checkpointing
+                    Liger Kernel, BF16 mixed precision
+Loss Scale:         ignore_empty_think
+GPU Memory:         ~74.62 GB per H100 (stable)
+```
+
+### Dataset Characteristics
+
+- **Samples**: 6,319 dramatic script continuations
+- **Average Length**: ~5,000 tokens per sample
+- **Max Length**: ~6,100 tokens
+- **Format**: Conversations with `<think>...</think>` reasoning tags
+- **Content**:
+  - Script opening scenes (title, description, initial dialogue)
+  - Extensive creative deliberation (3,000+ tokens of thinking)
+  - Script continuation with proper formatting
+- **Style**: Dramatic, emotionally intense scenarios (conflicts, reconciliation, tragedy)
+
+### Training Metrics
+
+- **Final Loss**: 0.844
+- **Average Loss**: 0.978
+- **Loss Trajectory**: 1.602 (start) → 0.82-0.83 (end)
+- **Training Speed**: ~8 seconds/iteration
+- **Total Steps**: 1,185
+- **Checkpoints**: 5 saved (400, 800, 900, 1000, 1185)
+
+## Usage
+
+### Quick Start (ms-swift)
+
+```bash
+# Install ms-swift
+pip install ms-swift
+
+# Inference (interactive mode)
+swift infer \
+    --ckpt_dir FutureMa/Qwen3-8B-Drama-Thinking \
+    --template qwen3_thinking \
+    --max_new_tokens 4096 \
+    --temperature 0.7
+```
+
+### Python API
+
+```python
+import os
+os.environ['CUDA_VISIBLE_DEVICES'] = '0'
+
+from swift.llm import PtEngine, InferRequest, RequestConfig
+
+# Initialize engine
+engine = PtEngine(
+    model_id_or_path="FutureMa/Qwen3-8B-Drama-Thinking",
+    max_batch_size=1,
+    torch_dtype="bfloat16"
+)
+
+# Create prompt
+prompt = """Title: The Reunion
+Description: Two estranged siblings meet at their childhood home after 20 years.
+
+INT. FAMILY LIVING ROOM - DAY
+
+SARAH (35) stands by the window, looking out at the garden.
+MICHAEL (38) enters, hesitant."""
+
+# Generate (streaming)
+messages = [{'role': 'user', 'content': prompt}]
+request = InferRequest(messages=messages)
+config = RequestConfig(max_tokens=4096, temperature=0.7, stream=True)
+
+for response in engine.infer([request], config)[0]:
+    if response:
+        print(response.choices[0].delta.content, end='', flush=True)
+```
+
+### Using with Transformers (requires adapting checkpoint)
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+model = AutoModelForCausalLM.from_pretrained(
+    "FutureMa/Qwen3-8B-Drama-Thinking",
+    torch_dtype="bfloat16",
+    device_map="auto"
+)
+
+tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B")
+
+messages = [
+    {"role": "system", "content": "You are a creative screenwriter assistant specialized in dramatic storytelling with internal reasoning."},
+    {"role": "user", "content": "Write a scene about forgiveness..."}
+]
+
+text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+inputs = tokenizer([text], return_tensors="pt").to(model.device)
+
+outputs = model.generate(**inputs, max_new_tokens=4096, temperature=0.7)
+print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+```
+
+## Example Output
+
+**Input Prompt:**
+```
+Title: The Last Apology
+Description: A daughter arrives at her estranged father's deathbed...
+```
+
+**Model Output** (excerpt):
+```
+<think>
+Okay. Let's go. I'm sitting here with the opening of *The Last Apology* fresh in my mind,
+and I need to figure out exactly where this goes next—not just the next scene, but the next
+emotional beat, the next structural pivot...
+
+First, the title: *The Last Apology*. That's not just poetic—it's structural. It tells me
+this story is about delayed recognition, about the finality of words left unsaid...
+
+[3,400 tokens of deep creative analysis including:]
+- Title deconstruction and thematic implications
+- Character psychology analysis
+- Three-act structure planning
+- Visual language and symbolism
+- Multiple narrative paths considered
+- Professional screenwriting techniques
+</think>
+
+INT. HOSPITAL ROOM - NIGHT
+
+ANNA (28), in a wrinkled business suit, hesitates at the doorway.
+
+DAVID (65) lies in bed, breathing labored...
+
+[Script continues with proper formatting]
+```
+
+## Intended Use
+
+### ✅ Recommended Use Cases
+
+1. **Screenwriting Education**: Learn the professional creative thinking process
+2. **Script Ideation**: Generate story frameworks and narrative alternatives
+3. **Story Consulting**: Explore "what if" scenarios with explicit reasoning
+4. **Creative Brainstorming**: Understand decision-making in storytelling
+5. **Draft Development**: Plan structure before execution
+
+### ❌ Not Recommended For
+
+1. **Final Shooting Scripts**: Requires human refinement for production
+2. **Comedy/Action Genres**: Training bias toward dramatic content
+3. **Long-form Series**: Single-pass generation may lack consistency
+4. **Immediate Production**: Dialogue needs naturalization
+
+## Evaluation Results
+
+### Quantitative Metrics (vs. Base Model)
+
+| Aspect | Score | Base Model | Improvement |
+|--------|-------|------------|-------------|
+| **Thinking Depth** | 9/10 | 5/10 | +80% |
+| **Script Format** | 9/10 | 8/10 | +13% |
+| **Dramatic Craft** | 8.5/10 | 8/10 | +6% |
+| **Character Psychology** | 9/10 | 6/10 | +50% |
+| **Decision Transparency** | 9/10 | 5/10 | +80% |
+| **Overall** | 8.1/10 | 6.9/10 | +17% |
+
+> **Note on Methodology:**
+> These metrics were generated using an **LLM-as-a-Judge** framework (Claude) comparing the fine-tuned model against the base model.
+
+### Qualitative Improvements
+
+- ✅ **Professional Voice**: Sounds like an experienced screenwriter
+- ✅ **Structural Thinking**: Explicit three-act planning
+- ✅ **Meta-Awareness**: "This isn't just a script. It's a reckoning."
+- ✅ **Non-Linear Reasoning**: Considers alternatives, backtracks, refines
+- ✅ **Craft-Oriented**: Explains why choices serve the story
+
+## Limitations
+
+1. **Thinking Verbosity**: Generates ~3,400 tokens of thinking (87% of output)
+   - May be excessive for quick tasks
+   - Consider using `max_new_tokens` to limit length
+
+2. **Incomplete Execution**: Token budget consumed by thinking
+   - Many planned scenes not fully generated
+   - May need 6,000-8,000 token limit for complete scripts
+
+3. **Dialogue Naturalness**: More direct/literary than conversational
+   - Training data style influences output
+   - May need post-processing for natural speech
+
+4. **Training Data Bias**: Skews toward melodramatic scenarios
+   - Less suited for subtle/realistic dialogue
+   - Best for emotionally intense stories
+
+## Training Insights
+
+### What Made This Successful
+
+1. **8192 Token Context**: Essential for capturing full thinking chains
+   - The initially assumed limit of 2,048 tokens would have truncated the data
+   - Average sample length: ~5,000 tokens
+
+2. **DeepSpeed Zero3**: Required (not optional)
+   - Single H100: Would need ~109-114 GB (OOM)
+   - Zero3 sharding: ~74.62 GB per card ✅
+
+3. **Full Parameter Training**: Worth the cost
+   - Deeper capability transfer than LoRA
+   - Better thinking process internalization
+   - Cost: $17.86 (2.8 hours) vs ~$5 for LoRA
+
+4. **Quality Training Data**: 6,319 long-form reasoning examples
+   - Actual creative process in `<think>` tags
+   - High-quality dramatic writing
+
+## Citation
+
+```bibtex
+@misc{qwen3-drama-thinking-2025,
+  author = {FutureMa},
+  title = {Qwen3-8B-Drama-Thinking: Full Parameter Fine-tuning for Creative Screenwriting},
+  year = {2025},
+  publisher = {HuggingFace},
+  howpublished = {\url{https://huggingface.co/FutureMa/Qwen3-8B-Drama-Thinking}},
+  note = {Full parameter fine-tuning on 6,319 drama samples with explicit reasoning chains}
+}
+```
+
+## News & Updates
+
+**[2025-12-23]** 🎉 **DramaBench Dataset is now open-source!** Evaluate your drama script generation with our comprehensive 6-dimensional benchmark framework (Format Standards, Narrative Efficiency, Character Consistency, Emotional Depth, Logic Consistency, Conflict Handling).
+- 📊 Dataset: [FutureMa/DramaBench](https://huggingface.co/datasets/FutureMa/DramaBench)
+- 📄 Paper: [arXiv:2512.19012](https://arxiv.org/abs/2512.19012)
+- 🌐 Demo: [dramabench.pages.dev](https://dramabench.pages.dev/)
+
+---
+
+## Acknowledgments
+
+- **Base Model**: [Qwen Team](https://huggingface.co/Qwen) - Qwen3-8B
+- **Training Framework**: [ms-swift](https://github.com/modelscope/ms-swift) - ModelScope SWIFT
+- **Infrastructure**: [Lambda Cloud](https://lambdalabs.com/) - 2x H100 80GB SXM5
+- **Dataset**: Custom Drama Thinking Dataset (6,319 samples)
+
+## Model Card Contact
+
+For questions or feedback:
+- **HuggingFace**: [@FutureMa](https://huggingface.co/FutureMa)
+- **GitHub Issues**: Report via the ms-swift repository
+
+---
+
+- **Training Date**: 2025-12-08
+- **Training Duration**: 2h 46m
+- **Model Size**: ~16GB (BF16 precision)
+- **Recommended VRAM**: 16GB+ for inference
\ No newline at end of file
diff --git a/added_tokens.json b/added_tokens.json
new file mode 100644
index 0000000..b54f913
--- /dev/null
+++ b/added_tokens.json
@@ -0,0 +1,28 @@
+{
+  "</think>": 151668,
+  "</tool_call>": 151658,
+  "</tool_response>": 151666,
+  "<think>": 151667,
+  "<tool_call>": 151657,
+  "<tool_response>": 151665,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}
diff --git a/args.json b/args.json
new file mode 100644
index 0000000..6a5639a
--- /dev/null
+++ b/args.json
@@ -0,0 +1,393 @@
+{
+  "output_dir": "/home/ubuntu/output/qwen3-8b-drama-thinking-full/v2-20251208-055020",
+  "overwrite_output_dir": false,
+  "do_train": false,
+  "do_eval": false,
+  "do_predict": false,
+  "eval_strategy": "no",
+  "prediction_loss_only": false,
+  "per_device_train_batch_size": 1,
+  "per_device_eval_batch_size": 1,
+  "per_gpu_train_batch_size": null,
+  "per_gpu_eval_batch_size": null,
+  "gradient_accumulation_steps": 8,
+  "eval_accumulation_steps": null,
+  "eval_delay": 0,
+  "torch_empty_cache_steps": null,
+  "learning_rate": 1e-05,
+  "weight_decay": 0.1,
+  "adam_beta1": 0.9,
+  "adam_beta2": 0.95,
+  "adam_epsilon": 1e-08,
+  "max_grad_norm": 1.0,
+  "num_train_epochs": 3.0,
+  "max_steps": -1,
+  "lr_scheduler_type": "cosine",
+  "lr_scheduler_kwargs": null,
+  "warmup_ratio": 0.05,
+  "warmup_steps": 0,
+  "log_level": "passive",
+  "log_level_replica": "warning",
+  "log_on_each_node": true,
+  "logging_dir": "/home/ubuntu/output/qwen3-8b-drama-thinking-full/logs",
+  "logging_strategy": "steps",
+  "logging_first_step": true,
+  "logging_steps": 5,
+  "logging_nan_inf_filter": true,
+  "save_strategy": "steps",
+  "save_steps": 100.0,
+  "save_total_limit": 3,
+  "save_safetensors": true,
+  "save_on_each_node": false,
+  "save_only_model": true,
+  "restore_callback_states_from_checkpoint": false,
+  "no_cuda": false,
+  "use_cpu": false,
+  "use_mps_device": false,
+  "seed": 42,
+  "data_seed": 42,
+  "jit_mode_eval": false,
+  "bf16": true,
+  "fp16": false,
+  "fp16_opt_level": "O1",
+  "half_precision_backend": "auto",
+  "bf16_full_eval": false,
+  "fp16_full_eval": false,
+  "tf32": null,
+  "local_rank": 0,
+  "ddp_backend": null,
+  "tpu_num_cores": null,
+  "tpu_metrics_debug": false,
+  "debug": null,
+  "dataloader_drop_last": false,
+  "eval_steps": 50.0,
+  "dataloader_num_workers": 4,
+  "dataloader_prefetch_factor": null,
+  "past_index": -1,
+  "run_name": "/home/ubuntu/output/qwen3-8b-drama-thinking-full/v2-20251208-055020",
+  "disable_tqdm": null,
+  "remove_unused_columns": true,
+  "label_names": null,
+  "load_best_model_at_end": false,
+  "metric_for_best_model": "loss",
+  "greater_is_better": false,
+  "ignore_data_skip": false,
+  "fsdp": null,
+  "fsdp_min_num_params": 0,
+  "fsdp_config": null,
+  "fsdp_transformer_layer_cls_to_wrap": null,
+  "accelerator_config": {
+    "dispatch_batches": false
+  },
+  "parallelism_config": null,
+  "deepspeed": {
+    "fp16": {
+      "enabled": "auto",
+      "loss_scale": 0,
+      "loss_scale_window": 1000,
+      "initial_scale_power": 16,
+      "hysteresis": 2,
+      "min_loss_scale": 1
+    },
+    "bf16": {
+      "enabled": "auto"
+    },
+    "zero_optimization": {
+      "stage": 3,
+      "offload_optimizer": {
+        "device": "none",
+        "pin_memory": true
+      },
+      "offload_param": {
+        "device": "none",
+        "pin_memory": true
+      },
+      "overlap_comm": false,
+      "contiguous_gradients": true,
+      "sub_group_size": 1000000000.0,
+      "reduce_bucket_size": "auto",
+      "zero_quantized_weights": false,
+      "zero_quantized_gradients": false,
+      "stage3_prefetch_bucket_size": "auto",
+      "stage3_param_persistence_threshold": "auto",
+      "stage3_max_live_parameters": 1000000000.0,
+      "stage3_max_reuse_distance": 1000000000.0,
+      "stage3_gather_16bit_weights_on_model_save": true
+    },
+    "gradient_accumulation_steps": "auto",
+    "gradient_clipping": "auto",
+    "steps_per_print": 2000,
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto",
+    "wall_clock_breakdown": false
+  },
+  "label_smoothing_factor": 0.0,
+  "optim": "adamw_torch_fused",
+  "optim_args": null,
+  "adafactor": false,
+  "group_by_length": false,
+  "length_column_name": "length",
+  "report_to": [
+    "tensorboard"
+  ],
+  "project": "huggingface",
+  "trackio_space_id": "trackio",
+  "ddp_find_unused_parameters": null,
+  "ddp_bucket_cap_mb": null,
+  "ddp_broadcast_buffers": null,
+  "dataloader_pin_memory": true,
+  "dataloader_persistent_workers": false,
+  "skip_memory_metrics": true,
+  "use_legacy_prediction_loop": false,
+  "push_to_hub": false,
+  "resume_from_checkpoint": null,
+  "hub_model_id": null,
+  "hub_strategy": "every_save",
+  "hub_token": null,
+  "hub_private_repo": null,
+  "hub_always_push": false,
+  "hub_revision": null,
+  "gradient_checkpointing": true,
+  "gradient_checkpointing_kwargs": "{\"use_reentrant\": false}",
+  "include_inputs_for_metrics": false,
+  "include_for_metrics": [],
+  "eval_do_concat_batches": true,
+  "fp16_backend": "auto",
+  "push_to_hub_model_id": null,
+  "push_to_hub_organization": null,
+  "push_to_hub_token": null,
+  "mp_parameters": "",
+  "auto_find_batch_size": false,
+  "full_determinism": false,
+  "torchdynamo": null,
+  "ray_scope": "last",
+  "ddp_timeout": 18000000,
+  "torch_compile": false,
+  "torch_compile_backend": null,
+  "torch_compile_mode": null,
+  "include_tokens_per_second": false,
+  "include_num_input_tokens_seen": false,
+  "neftune_noise_alpha": null,
+  "optim_target_modules": null,
+  "batch_eval_metrics": false,
+  "eval_on_start": false,
+  "use_liger_kernel": true,
+  "liger_kernel_config": null,
+  "eval_use_gather_object": false,
+  "average_tokens_across_devices": true,
+  "sortish_sampler": false,
+  "predict_with_generate": false,
+  "generation_max_length": null,
+  "generation_num_beams": null,
+  "generation_config": null,
+  "tuner_backend": "peft",
+  "vit_gradient_checkpointing": null,
+  "router_aux_loss_coef": 0.0,
+  "enable_dft_loss": false,
+  "enable_channel_loss": false,
+  "check_model": true,
+  "acc_strategy": "token",
+  "train_dataloader_shuffle": true,
+  "max_epochs": null,
+  "aligner_lr": null,
+  "vit_lr": null,
+  "use_logits_to_keep": null,
+  "ds3_gather_for_generation": true,
+  "resume_only_model": false,
+  "optimizer": null,
+  "loss_type": null,
+  "metric": null,
+  "eval_use_evalscope": false,
+  "eval_dataset": [],
+  "eval_dataset_args": null,
+  "eval_limit": null,
+  "eval_generation_config": null,
+  "extra_eval_args": null,
+  "use_flash_ckpt": false,
+  "use_ray": false,
+  "ray_exp_name": null,
+  "device_groups": null,
+  "model": "Qwen/Qwen3-8B",
+  "model_type": "qwen3",
+  "model_revision": null,
+  "task_type": "causal_lm",
+  "torch_dtype": "bfloat16",
+  "attn_impl": null,
+  "new_special_tokens": [],
+  "num_labels": null,
+  "problem_type": null,
+  "rope_scaling": null,
+  "device_map": null,
+  "max_memory": {},
+  "max_model_len": null,
+  "local_repo_path": null,
+  "init_strategy": null,
+  "template": "qwen3_thinking",
+  "system": "You are a creative screenwriter assistant specialized in dramatic storytelling with internal reasoning.",
+  "max_length": 8192,
+  "truncation_strategy": "delete",
+  "max_pixels": null,
+  "agent_template": null,
+  "norm_bbox": null,
+  "use_chat_template": true,
+  "padding_free": false,
+  "padding_side": "right",
+  "loss_scale": "ignore_empty_think",
+  "sequence_parallel_size": 1,
+  "response_prefix": null,
+  "template_backend": "swift",
+  "dataset": [
+    "~/drama_thinking_dataset_msswift.jsonl"
+  ],
+  "val_dataset": [],
+  "cached_dataset": [],
+  "cached_val_dataset": [],
+  "split_dataset_ratio": 0.0,
+  "dataset_num_proc": 1,
+  "load_from_cache_file": false,
+  "dataset_shuffle": true,
+  "val_dataset_shuffle": false,
+  "streaming": false,
+  "interleave_prob": null,
+  "stopping_strategy": "first_exhausted",
+  "shuffle_buffer_size": 1000,
+  "download_mode": "reuse_dataset_if_exists",
+  "columns": {},
+  "strict": false,
+  "model_name": [
+    "qwen3-drama-thinking"
+  ],
+  "model_author": [
+    "msj"
+  ],
+  "custom_dataset_info": [],
+  "quant_method": null,
+  "quant_bits": null,
+  "hqq_axis": null,
+  "bnb_4bit_compute_dtype": "bfloat16",
+  "bnb_4bit_quant_type": "nf4",
+  "bnb_4bit_use_double_quant": true,
+  "bnb_4bit_quant_storage": null,
+  "max_new_tokens": 64,
+  "temperature": 0.0,
+  "top_k": null,
+  "top_p": null,
+  "repetition_penalty": null,
+  "num_beams": 1,
+  "stream": false,
+  "stop_words": [],
+  "logprobs": false,
+  "top_logprobs": null,
+  "ckpt_dir": null,
+  "lora_modules": [],
+  "train_type": "full",
+  "adapters": [],
+  "external_plugins": [],
+  "model_kwargs": {},
+  "load_args": false,
+  "load_data_args": false,
+  "packing": false,
+  "packing_length": null,
+  "packing_num_proc": 1,
+  "lazy_tokenize": false,
+  "custom_register_path": [],
+  "use_hf": false,
+  "ignore_args_error": false,
+  "use_swift_lora": false,
+  "freeze_parameters": [],
+  "freeze_parameters_regex": null,
+  "freeze_parameters_ratio": 0.0,
+  "trainable_parameters": [],
+  "trainable_parameters_regex": null,
+  "freeze_llm": false,
+  "freeze_vit": true,
+  "freeze_aligner": true,
+  "target_modules": [
+    "all-linear"
+  ],
+  "target_regex": null,
+  "target_parameters": null,
+  "modules_to_save": [],
+  "lora_rank": 8,
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "lora_bias": "none",
+  "lora_dtype": null,
+  "lorap_lr_ratio": null,
+  "use_rslora": false,
+  "use_dora": false,
+  "lora_ga_batch_size": 2,
+  "lora_ga_iters": 2,
+  "lora_ga_max_length": 1024,
+  "lora_ga_direction": "ArB2r",
+  "lora_ga_scale": "stable",
+  "lora_ga_stable_gamma": 16,
+  "init_weights": true,
+  "fourier_n_frequency": 2000,
+  "fourier_scaling": 300.0,
+  "boft_block_size": 4,
+  "boft_block_num": 0,
+  "boft_n_butterfly_factor": 1,
+  "boft_dropout": 0.0,
+  "vera_rank": 256,
+  "vera_projection_prng_key": 0,
+  "vera_dropout": 0.0,
+  "vera_d_initial": 0.1,
+  "adapter_act": "gelu",
+  "adapter_length": 128,
+  "use_galore": false,
+  "galore_target_modules": null,
+  "galore_rank": 128,
+  "galore_update_proj_gap": 50,
+  "galore_scale": 1.0,
+  "galore_proj_type": "std",
+  "galore_optim_per_parameter": false,
+  "galore_with_embedding": false,
+  "galore_quantization": false,
+  "galore_proj_quant": false,
+  "galore_proj_bits": 4,
+  "galore_proj_group_size": 256,
+  "galore_cos_threshold": 0.4,
+  "galore_gamma_proj": 2,
+  "galore_queue_size": 5,
+  "adalora_target_r": 8,
+  "adalora_init_r": 12,
+  "adalora_tinit": 0,
+  "adalora_tfinal": 0,
+  "adalora_deltaT": 1,
+  "adalora_beta1": 0.85,
+  "adalora_beta2": 0.85,
+  "adalora_orth_reg_weight": 0.5,
+  "llamapro_num_new_blocks": 4,
+  "llamapro_num_groups": null,
+  "lisa_activated_layers": 0,
+  "lisa_step_interval": 20,
+  "reft_layer_key": null,
+  "reft_layers": null,
+  "reft_rank": 4,
+  "reft_intervention_type": "LoreftIntervention",
+  "reft_args": null,
+  "swanlab_token": null,
+  "swanlab_project": null,
+  "swanlab_workspace": null,
+  "swanlab_exp_name": null,
+  "swanlab_lark_webhook_url": null,
+  "swanlab_lark_secret": null,
+  "swanlab_mode": "cloud",
+  "add_version": true,
+  "create_checkpoint_symlink": false,
+  "zero_hpz_partition_size": null,
+  "deepspeed_autotp_size": null,
+  "early_stop_interval": null,
+  "rank": 0,
+  "global_world_size": 2,
+  "local_world_size": 2,
+  "model_suffix": "Qwen3-8B",
+  "model_info": "ModelInfo(model_type='qwen3', model_dir='/home/ubuntu/.cache/modelscope/hub/models/Qwen/Qwen3-8B', torch_dtype=torch.bfloat16, max_model_len=40960, quant_method=None, quant_bits=None, rope_scaling=None, is_moe_model=False, is_multimodal=False, config=None, task_type='causal_lm', num_labels=None)",
+  "model_meta": "ModelMeta(model_type='qwen3', model_groups=[ModelGroup(models=[Model(ms_model_id='Qwen/Qwen3-0.6B-Base', hf_model_id='Qwen/Qwen3-0.6B-Base', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-1.7B-Base', hf_model_id='Qwen/Qwen3-1.7B-Base', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-4B-Base', hf_model_id='Qwen/Qwen3-4B-Base', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-8B-Base', hf_model_id='Qwen/Qwen3-8B-Base', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-14B-Base', hf_model_id='Qwen/Qwen3-14B-Base', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-0.6B', hf_model_id='Qwen/Qwen3-0.6B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-1.7B', hf_model_id='Qwen/Qwen3-1.7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-4B', hf_model_id='Qwen/Qwen3-4B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-8B', hf_model_id='Qwen/Qwen3-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-14B', hf_model_id='Qwen/Qwen3-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-32B', hf_model_id='Qwen/Qwen3-32B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-0.6B-FP8', hf_model_id='Qwen/Qwen3-0.6B-FP8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-1.7B-FP8', hf_model_id='Qwen/Qwen3-1.7B-FP8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-4B-FP8', hf_model_id='Qwen/Qwen3-4B-FP8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-8B-FP8', hf_model_id='Qwen/Qwen3-8B-FP8', model_path=None, ms_revision=None, hf_revision=None), 
Model(ms_model_id='Qwen/Qwen3-14B-FP8', hf_model_id='Qwen/Qwen3-14B-FP8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-32B-FP8', hf_model_id='Qwen/Qwen3-32B-FP8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-4B-AWQ', hf_model_id='Qwen/Qwen3-4B-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-8B-AWQ', hf_model_id='Qwen/Qwen3-8B-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-14B-AWQ', hf_model_id='Qwen/Qwen3-14B-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-32B-AWQ', hf_model_id='Qwen/Qwen3-32B-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='swift/Qwen3-32B-AWQ', hf_model_id=None, model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='qwen3', get_function=<function get_model_tokenizer_with_flash_attn at 0x70cb4f851630>, model_arch=ModelKeys(arch_name='llama', embedding='model.embed_tokens', module_list='model.layers', lm_head='lm_head', q_proj='model.layers.{}.self_attn.q_proj', k_proj='model.layers.{}.self_attn.k_proj', v_proj='model.layers.{}.self_attn.v_proj', o_proj='model.layers.{}.self_attn.o_proj', attention='model.layers.{}.self_attn', mlp='model.layers.{}.mlp', down_proj='model.layers.{}.mlp.down_proj', qkv_proj=None, qk_proj=None, qa_proj=None, qb_proj=None, kv_proj=None, kva_proj=None, kvb_proj=None), architectures=['Qwen3ForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, is_reranker=False, task_type=None, ignore_patterns=None, requires=['transformers>=4.51'], tags=[])",
+  "model_dir": "/home/ubuntu/.cache/modelscope/hub/models/Qwen/Qwen3-8B",
+  "_val_dataset_exists": [],
+  "hub": "<class 'swift.hub.hub.MSHub'>",
+  "evaluation_strategy": "steps",
+  "training_args": "Seq2SeqTrainingArguments(output_dir='/home/ubuntu/output/qwen3-8b-drama-thinking-full/v2-20251208-055020', overwrite_output_dir=False, do_train=False, do_eval=False, do_predict=False, eval_strategy=<IntervalStrategy.NO: 'no'>, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=1e-05, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/ubuntu/output/qwen3-8b-drama-thinking-full/logs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.STEPS: 'steps'>, save_steps=100, save_total_limit=3, save_safetensors=True, save_on_each_node=False, save_only_model=True, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=50.0, dataloader_num_workers=4, dataloader_prefetch_factor=10, past_index=-1, run_name='/home/ubuntu/output/qwen3-8b-drama-thinking-full/v2-20251208-055020', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 
'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), parallelism_config=None, deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': False, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'zero_quantized_weights': False, 'zero_quantized_gradients': False, 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH_FUSED: 'adamw_torch_fused'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], project='huggingface', trackio_space_id='trackio', ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=None, hub_always_push=False, hub_revision=None, gradient_checkpointing=True, 
gradient_checkpointing_kwargs={'use_reentrant': False}, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=18000000, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=True, liger_kernel_config=None, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, tuner_backend='peft', vit_gradient_checkpointing=True, router_aux_loss_coef=0.0, enable_dft_loss=False, enable_channel_loss=False, check_model=True, acc_strategy='token', train_dataloader_shuffle=True, max_epochs=None, aligner_lr=None, vit_lr=None, use_logits_to_keep=None, ds3_gather_for_generation=True, resume_only_model=False, optimizer=None, loss_type=None, metric=None, eval_use_evalscope=False, eval_dataset=[], eval_dataset_args=None, eval_limit=None, eval_generation_config=None, extra_eval_args=None, use_flash_ckpt=False, sft_alpha=0, chord_sft_dataset=[], chord_sft_per_device_train_batch_size=None, chord_enable_phi_function=False, chord_mu_warmup_steps=None, chord_mu_decay_steps=None, chord_mu_peak=None, chord_mu_valley=None, train_type='full', local_repo_path=None, galore_config=None, padding_side='right', padding_free=False, task_type='causal_lm', problem_type=None)"
+}
\ No newline at end of file
diff --git a/chat_template.jinja b/chat_template.jinja
new file mode 100644
index 0000000..01be9b3
--- /dev/null
+++ b/chat_template.jinja
@@ -0,0 +1,89 @@
+{%- if tools %}{#- Tools supplied: open a system turn holding the optional leading system message followed by the tool-calling instructions. -#}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}{#- One JSON-serialised tool signature per line inside the tools tag. -#}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}{#- No tools: only a leading system message, when present, is rendered as the system turn. -#}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}{#- The reverse scan below records the index of the last user turn that is not a wrapped tool response; multi_step_tool stays true until that first match. -#}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}{#- Convert the reversed loop position back to an index into messages. -#}
+    {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if message.content is string %}{#- Non-string content is rendered as an empty string. -#}
+        {%- set content = message.content %}
+    {%- else %}
+        {%- set content = '' %}
+    {%- endif %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}{#- User turns and any system turn other than the very first message are emitted as-is. -#}
+        {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is string %}{#- Prefer an explicit reasoning_content field; otherwise split the reasoning out of content at the closing think tag. -#}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in content %}
+                {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+                {%- set content = content.split('</think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}{#- A think block is written only for assistant turns after the last user query, and there only for the final message or when reasoning is non-empty; every other assistant turn renders content alone. -#}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}{#- Newline separator before every tool call except a first call that follows empty content. -#}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}{#- Unwrap the call when name and arguments are nested under a function key. -#}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}{#- String arguments are emitted verbatim; anything else is serialised with tojson. -#}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}{#- Consecutive tool messages are folded into a single user turn, each one wrapped in its own tool_response tag. -#}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}{#- Generation prompt: open an assistant turn; when enable_thinking is explicitly false an empty think block is pre-filled. -#}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/config.json b/config.json
new file mode 100644
index 0000000..f8a98d4
--- /dev/null
+++ b/config.json
@@ -0,0 +1,68 @@
+{
+  "architectures": [
+    "Qwen3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "dtype": "bfloat16",
+  "eos_token_id": 151645,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 12288,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 40960,
+  "max_window_layers": 36,
+  "model_type": "qwen3",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 36,
+  "num_key_value_heads": 8,
+  "pad_token_id": 151643,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "transformers_version": "4.57.3",
+  "use_cache": false,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
diff --git a/configuration.json b/configuration.json
new file mode 100644
index 0000000..bbeeda1
--- /dev/null
+++ b/configuration.json
@@ -0,0 +1 @@
+{"framework": "pytorch", "task": "text-generation", "allow_remote": true}
\ No newline at end of file
diff --git a/generation_config.json b/generation_config.json
new file mode 100644
index 0000000..98e0755
--- /dev/null
+++ b/generation_config.json
@@ -0,0 +1,13 @@
+{
+  "bos_token_id": 151643,
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "temperature": 0.6,
+  "top_k": 20,
+  "top_p": 0.95,
+  "transformers_version": "4.57.3"
+}
diff --git a/merges.txt b/merges.txt
new file mode 100644
index 0000000..80c1a19
--- /dev/null
+++ b/merges.txt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8831e4f1a044471340f7c0a83d7bd71306a5b867e95fd870f74d0c5308a904d5
+size 1671853
diff --git a/model-00001-of-00004.safetensors b/model-00001-of-00004.safetensors
new file mode 100644
index 0000000..975d9ed
--- /dev/null
+++ b/model-00001-of-00004.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5f60e26ff9187a86191977fe255865508da9ba5d8f84c7181b76673a38c32fa9
+size 4902257696
diff --git a/model-00002-of-00004.safetensors b/model-00002-of-00004.safetensors
new file mode 100644
index 0000000..e72c61e
--- /dev/null
+++ b/model-00002-of-00004.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:372cc2f41d3805d0976cb2c4235ffced77133720f0cef420eacb1d3f975e9c13
+size 4915960368
diff --git a/model-00003-of-00004.safetensors b/model-00003-of-00004.safetensors
new file mode 100644
index 0000000..dc7db05
--- /dev/null
+++ b/model-00003-of-00004.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c5d4596e6ad0c9c053e7e3a7f9efb1b55c1d48c9fea8a3719beb328af0df58b2
+size 4983068496
diff --git a/model-00004-of-00004.safetensors b/model-00004-of-00004.safetensors
new file mode 100644
index 0000000..2214591
--- /dev/null
+++ b/model-00004-of-00004.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3acb42d50dbac0844e4f237dfe285ef618b93468fbe3d541c7f65d9dbc72ae5a
+size 1580230264
diff --git a/model.safetensors.index.json b/model.safetensors.index.json
new file mode 100644
index 0000000..ba886c0
--- /dev/null
+++ b/model.safetensors.index.json
@@ -0,0 +1,407 @@
+{
+  "metadata": {
+    "total_parameters": 8190735360,
+    "total_size": 16381470720
+  },
+  "weight_map": {
+    "lm_head.weight": "model-00004-of-00004.safetensors",
+    "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.20.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.20.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.20.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.20.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.20.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.21.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.21.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.21.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.21.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.21.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.22.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.32.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.32.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.32.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.32.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.32.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.32.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.32.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.32.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.32.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.32.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.32.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.33.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.33.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.33.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.33.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.33.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.33.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.33.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.33.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.33.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.33.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.33.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.34.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.34.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.34.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.34.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.34.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.34.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.34.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.34.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.34.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.34.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.34.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.35.input_layernorm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.35.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.35.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.35.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.35.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.35.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.35.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.35.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.35.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.35.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.35.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.9.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.norm.weight": "model-00004-of-00004.safetensors"
+  }
+}
diff --git a/special_tokens_map.json b/special_tokens_map.json
new file mode 100644
index 0000000..ac23c0a
--- /dev/null
+++ b/special_tokens_map.json
@@ -0,0 +1,31 @@
+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/tokenizer.json b/tokenizer.json
new file mode 100644
index 0000000..cd71f61
--- /dev/null
+++ b/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
+size 11422654
diff --git a/tokenizer_config.json b/tokenizer_config.json
new file mode 100644
index 0000000..ddaf698
--- /dev/null
+++ b/tokenizer_config.json
@@ -0,0 +1,239 @@
+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151665": {
+      "content": "<tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151666": {
+      "content": "</tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151667": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151668": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}
diff --git a/trainer_state.json b/trainer_state.json
new file mode 100644
index 0000000..36235ba
--- /dev/null
+++ b/trainer_state.json
@@ -0,0 +1,1700 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.0,
+  "eval_steps": 50.0,
+  "global_step": 1185,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0025324469768914213,
+      "grad_norm": 6.541903357166615,
+      "learning_rate": 1.6666666666666668e-07,
+      "loss": 1.6022449731826782,
+      "step": 1
+    },
+    {
+      "epoch": 0.012662234884457106,
+      "grad_norm": 6.812644002932686,
+      "learning_rate": 8.333333333333333e-07,
+      "loss": 1.5844556093215942,
+      "step": 5
+    },
+    {
+      "epoch": 0.025324469768914212,
+      "grad_norm": 5.463309643291768,
+      "learning_rate": 1.6666666666666667e-06,
+      "loss": 1.5758234024047852,
+      "step": 10
+    },
+    {
+      "epoch": 0.03798670465337132,
+      "grad_norm": 3.5947186062437324,
+      "learning_rate": 2.5e-06,
+      "loss": 1.5148856163024902,
+      "step": 15
+    },
+    {
+      "epoch": 0.050648939537828425,
+      "grad_norm": 2.022183241828244,
+      "learning_rate": 3.3333333333333333e-06,
+      "loss": 1.4563226699829102,
+      "step": 20
+    },
+    {
+      "epoch": 0.06331117442228554,
+      "grad_norm": 2.568206156447999,
+      "learning_rate": 4.166666666666667e-06,
+      "loss": 1.4071508407592774,
+      "step": 25
+    },
+    {
+      "epoch": 0.07597340930674264,
+      "grad_norm": 1.3172558883152181,
+      "learning_rate": 5e-06,
+      "loss": 1.361931037902832,
+      "step": 30
+    },
+    {
+      "epoch": 0.08863564419119975,
+      "grad_norm": 1.289611427468001,
+      "learning_rate": 5.833333333333334e-06,
+      "loss": 1.3146369934082032,
+      "step": 35
+    },
+    {
+      "epoch": 0.10129787907565685,
+      "grad_norm": 1.0096819520657572,
+      "learning_rate": 6.666666666666667e-06,
+      "loss": 1.2890718460083008,
+      "step": 40
+    },
+    {
+      "epoch": 0.11396011396011396,
+      "grad_norm": 0.9312368002064222,
+      "learning_rate": 7.500000000000001e-06,
+      "loss": 1.262472152709961,
+      "step": 45
+    },
+    {
+      "epoch": 0.12662234884457108,
+      "grad_norm": 0.9372677018897868,
+      "learning_rate": 8.333333333333334e-06,
+      "loss": 1.2449541091918945,
+      "step": 50
+    },
+    {
+      "epoch": 0.13928458372902816,
+      "grad_norm": 1.156336182386965,
+      "learning_rate": 9.166666666666666e-06,
+      "loss": 1.23805570602417,
+      "step": 55
+    },
+    {
+      "epoch": 0.15194681861348527,
+      "grad_norm": 1.2881982404625736,
+      "learning_rate": 1e-05,
+      "loss": 1.2132294654846192,
+      "step": 60
+    },
+    {
+      "epoch": 0.1646090534979424,
+      "grad_norm": 1.0254220781070695,
+      "learning_rate": 9.999512620046523e-06,
+      "loss": 1.220973587036133,
+      "step": 65
+    },
+    {
+      "epoch": 0.1772712883823995,
+      "grad_norm": 0.9489950684909466,
+      "learning_rate": 9.998050575201772e-06,
+      "loss": 1.2019853591918945,
+      "step": 70
+    },
+    {
+      "epoch": 0.1899335232668566,
+      "grad_norm": 0.9316575478826806,
+      "learning_rate": 9.995614150494293e-06,
+      "loss": 1.2073640823364258,
+      "step": 75
+    },
+    {
+      "epoch": 0.2025957581513137,
+      "grad_norm": 1.0042469325935621,
+      "learning_rate": 9.992203820909906e-06,
+      "loss": 1.1844447135925293,
+      "step": 80
+    },
+    {
+      "epoch": 0.2152579930357708,
+      "grad_norm": 0.9710381702713043,
+      "learning_rate": 9.987820251299121e-06,
+      "loss": 1.1868626594543457,
+      "step": 85
+    },
+    {
+      "epoch": 0.22792022792022792,
+      "grad_norm": 0.9015713419278082,
+      "learning_rate": 9.982464296247523e-06,
+      "loss": 1.16792631149292,
+      "step": 90
+    },
+    {
+      "epoch": 0.24058246280468504,
+      "grad_norm": 0.9242205484551671,
+      "learning_rate": 9.976136999909156e-06,
+      "loss": 1.1806648254394532,
+      "step": 95
+    },
+    {
+      "epoch": 0.25324469768914215,
+      "grad_norm": 0.8421714973436404,
+      "learning_rate": 9.968839595802982e-06,
+      "loss": 1.1688653945922851,
+      "step": 100
+    },
+    {
+      "epoch": 0.26590693257359926,
+      "grad_norm": 0.9053511703432988,
+      "learning_rate": 9.960573506572391e-06,
+      "loss": 1.1603254318237304,
+      "step": 105
+    },
+    {
+      "epoch": 0.2785691674580563,
+      "grad_norm": 0.8815755237366663,
+      "learning_rate": 9.951340343707852e-06,
+      "loss": 1.1436431884765625,
+      "step": 110
+    },
+    {
+      "epoch": 0.29123140234251343,
+      "grad_norm": 0.9133167544949871,
+      "learning_rate": 9.941141907232766e-06,
+      "loss": 1.1711238861083983,
+      "step": 115
+    },
+    {
+      "epoch": 0.30389363722697055,
+      "grad_norm": 0.9280708661664501,
+      "learning_rate": 9.929980185352525e-06,
+      "loss": 1.1607641220092773,
+      "step": 120
+    },
+    {
+      "epoch": 0.31655587211142766,
+      "grad_norm": 0.8789051540869617,
+      "learning_rate": 9.91785735406693e-06,
+      "loss": 1.1655372619628905,
+      "step": 125
+    },
+    {
+      "epoch": 0.3292181069958848,
+      "grad_norm": 0.9387606971380588,
+      "learning_rate": 9.904775776745959e-06,
+      "loss": 1.1415754318237306,
+      "step": 130
+    },
+    {
+      "epoch": 0.3418803418803419,
+      "grad_norm": 0.8962535961715238,
+      "learning_rate": 9.890738003669029e-06,
+      "loss": 1.141004753112793,
+      "step": 135
+    },
+    {
+      "epoch": 0.354542576764799,
+      "grad_norm": 0.8628618510513137,
+      "learning_rate": 9.875746771527817e-06,
+      "loss": 1.1703954696655274,
+      "step": 140
+    },
+    {
+      "epoch": 0.3672048116492561,
+      "grad_norm": 0.901181222022341,
+      "learning_rate": 9.859805002892733e-06,
+      "loss": 1.1528019905090332,
+      "step": 145
+    },
+    {
+      "epoch": 0.3798670465337132,
+      "grad_norm": 0.8630918009712893,
+      "learning_rate": 9.842915805643156e-06,
+      "loss": 1.1367189407348632,
+      "step": 150
+    },
+    {
+      "epoch": 0.3925292814181703,
+      "grad_norm": 0.8702195806012554,
+      "learning_rate": 9.825082472361558e-06,
+      "loss": 1.1533798217773437,
+      "step": 155
+    },
+    {
+      "epoch": 0.4051915163026274,
+      "grad_norm": 0.8708614692916694,
+      "learning_rate": 9.806308479691595e-06,
+      "loss": 1.158640480041504,
+      "step": 160
+    },
+    {
+      "epoch": 0.4178537511870845,
+      "grad_norm": 0.8793848769376316,
+      "learning_rate": 9.786597487660336e-06,
+      "loss": 1.1480545043945312,
+      "step": 165
+    },
+    {
+      "epoch": 0.4305159860715416,
+      "grad_norm": 0.8658001687150836,
+      "learning_rate": 9.765953338964736e-06,
+      "loss": 1.1336278915405273,
+      "step": 170
+    },
+    {
+      "epoch": 0.44317822095599874,
+      "grad_norm": 0.8569493052828222,
+      "learning_rate": 9.744380058222483e-06,
+      "loss": 1.1366922378540039,
+      "step": 175
+    },
+    {
+      "epoch": 0.45584045584045585,
+      "grad_norm": 0.8658238768368638,
+      "learning_rate": 9.721881851187406e-06,
+      "loss": 1.1221330642700196,
+      "step": 180
+    },
+    {
+      "epoch": 0.46850269072491296,
+      "grad_norm": 0.8315025062463812,
+      "learning_rate": 9.698463103929542e-06,
+      "loss": 1.137201690673828,
+      "step": 185
+    },
+    {
+      "epoch": 0.4811649256093701,
+      "grad_norm": 0.8646733066379476,
+      "learning_rate": 9.674128381980073e-06,
+      "loss": 1.1246437072753905,
+      "step": 190
+    },
+    {
+      "epoch": 0.49382716049382713,
+      "grad_norm": 0.9329613102085004,
+      "learning_rate": 9.648882429441258e-06,
+      "loss": 1.1196226119995116,
+      "step": 195
+    },
+    {
+      "epoch": 0.5064893953782843,
+      "grad_norm": 0.8893896484251661,
+      "learning_rate": 9.622730168061568e-06,
+      "loss": 1.1334550857543946,
+      "step": 200
+    },
+    {
+      "epoch": 0.5191516302627414,
+      "grad_norm": 0.912333387639604,
+      "learning_rate": 9.595676696276173e-06,
+      "loss": 1.1253994941711425,
+      "step": 205
+    },
+    {
+      "epoch": 0.5318138651471985,
+      "grad_norm": 0.982396926968246,
+      "learning_rate": 9.567727288213005e-06,
+      "loss": 1.1222535133361817,
+      "step": 210
+    },
+    {
+      "epoch": 0.5444761000316556,
+      "grad_norm": 0.885141191565451,
+      "learning_rate": 9.538887392664544e-06,
+      "loss": 1.1143704414367677,
+      "step": 215
+    },
+    {
+      "epoch": 0.5571383349161126,
+      "grad_norm": 0.840306231211871,
+      "learning_rate": 9.50916263202557e-06,
+      "loss": 1.1145578384399415,
+      "step": 220
+    },
+    {
+      "epoch": 0.5698005698005698,
+      "grad_norm": 0.873418768799577,
+      "learning_rate": 9.478558801197065e-06,
+      "loss": 1.1184951782226562,
+      "step": 225
+    },
+    {
+      "epoch": 0.5824628046850269,
+      "grad_norm": 0.8644731775393019,
+      "learning_rate": 9.44708186645649e-06,
+      "loss": 1.1118096351623534,
+      "step": 230
+    },
+    {
+      "epoch": 0.595125039569484,
+      "grad_norm": 0.8383543019686877,
+      "learning_rate": 9.414737964294636e-06,
+      "loss": 1.1120855331420898,
+      "step": 235
+    },
+    {
+      "epoch": 0.6077872744539411,
+      "grad_norm": 0.8339381439594867,
+      "learning_rate": 9.381533400219319e-06,
+      "loss": 1.0976166725158691,
+      "step": 240
+    },
+    {
+      "epoch": 0.6204495093383983,
+      "grad_norm": 1.2527861157729694,
+      "learning_rate": 9.347474647526095e-06,
+      "loss": 1.1195283889770509,
+      "step": 245
+    },
+    {
+      "epoch": 0.6331117442228553,
+      "grad_norm": 0.896892265554505,
+      "learning_rate": 9.312568346036288e-06,
+      "loss": 1.1280832290649414,
+      "step": 250
+    },
+    {
+      "epoch": 0.6457739791073125,
+      "grad_norm": 0.863217024084411,
+      "learning_rate": 9.276821300802535e-06,
+      "loss": 1.1169985771179198,
+      "step": 255
+    },
+    {
+      "epoch": 0.6584362139917695,
+      "grad_norm": 0.8613522109819245,
+      "learning_rate": 9.24024048078213e-06,
+      "loss": 1.110457420349121,
+      "step": 260
+    },
+    {
+      "epoch": 0.6710984488762266,
+      "grad_norm": 0.8269568651408957,
+      "learning_rate": 9.202833017478421e-06,
+      "loss": 1.1079233169555665,
+      "step": 265
+    },
+    {
+      "epoch": 0.6837606837606838,
+      "grad_norm": 0.9106153459573166,
+      "learning_rate": 9.164606203550498e-06,
+      "loss": 1.115132713317871,
+      "step": 270
+    },
+    {
+      "epoch": 0.6964229186451408,
+      "grad_norm": 0.8475270076896408,
+      "learning_rate": 9.125567491391476e-06,
+      "loss": 1.114927101135254,
+      "step": 275
+    },
+    {
+      "epoch": 0.709085153529598,
+      "grad_norm": 0.8419303390301319,
+      "learning_rate": 9.085724491675642e-06,
+      "loss": 1.1053291320800782,
+      "step": 280
+    },
+    {
+      "epoch": 0.7217473884140551,
+      "grad_norm": 0.8793202963091465,
+      "learning_rate": 9.045084971874738e-06,
+      "loss": 1.1043977737426758,
+      "step": 285
+    },
+    {
+      "epoch": 0.7344096232985122,
+      "grad_norm": 0.8844837887961337,
+      "learning_rate": 9.003656854743667e-06,
+      "loss": 1.0930152893066407,
+      "step": 290
+    },
+    {
+      "epoch": 0.7470718581829693,
+      "grad_norm": 0.8243068254935355,
+      "learning_rate": 8.961448216775955e-06,
+      "loss": 1.1083423614501953,
+      "step": 295
+    },
+    {
+      "epoch": 0.7597340930674265,
+      "grad_norm": 0.8231480643363308,
+      "learning_rate": 8.9184672866292e-06,
+      "loss": 1.093316650390625,
+      "step": 300
+    },
+    {
+      "epoch": 0.7723963279518835,
+      "grad_norm": 0.856487105562053,
+      "learning_rate": 8.874722443520898e-06,
+      "loss": 1.0935728073120117,
+      "step": 305
+    },
+    {
+      "epoch": 0.7850585628363406,
+      "grad_norm": 0.9069069891533103,
+      "learning_rate": 8.83022221559489e-06,
+      "loss": 1.085923957824707,
+      "step": 310
+    },
+    {
+      "epoch": 0.7977207977207977,
+      "grad_norm": 0.8415924258567633,
+      "learning_rate": 8.784975278258783e-06,
+      "loss": 1.1055352210998535,
+      "step": 315
+    },
+    {
+      "epoch": 0.8103830326052548,
+      "grad_norm": 0.8547842295217172,
+      "learning_rate": 8.73899045249266e-06,
+      "loss": 1.1053098678588866,
+      "step": 320
+    },
+    {
+      "epoch": 0.823045267489712,
+      "grad_norm": 0.9042040663099864,
+      "learning_rate": 8.692276703129421e-06,
+      "loss": 1.100543212890625,
+      "step": 325
+    },
+    {
+      "epoch": 0.835707502374169,
+      "grad_norm": 0.840156677605529,
+      "learning_rate": 8.644843137107058e-06,
+      "loss": 1.1007650375366211,
+      "step": 330
+    },
+    {
+      "epoch": 0.8483697372586262,
+      "grad_norm": 0.8554168041829401,
+      "learning_rate": 8.596699001693257e-06,
+      "loss": 1.095210647583008,
+      "step": 335
+    },
+    {
+      "epoch": 0.8610319721430832,
+      "grad_norm": 0.8378136162576828,
+      "learning_rate": 8.547853682682605e-06,
+      "loss": 1.0945035934448242,
+      "step": 340
+    },
+    {
+      "epoch": 0.8736942070275404,
+      "grad_norm": 0.8300982370825878,
+      "learning_rate": 8.498316702566828e-06,
+      "loss": 1.0824993133544922,
+      "step": 345
+    },
+    {
+      "epoch": 0.8863564419119975,
+      "grad_norm": 0.8879949006435145,
+      "learning_rate": 8.44809771867835e-06,
+      "loss": 1.0910042762756347,
+      "step": 350
+    },
+    {
+      "epoch": 0.8990186767964545,
+      "grad_norm": 0.8363110809635331,
+      "learning_rate": 8.397206521307584e-06,
+      "loss": 1.085635280609131,
+      "step": 355
+    },
+    {
+      "epoch": 0.9116809116809117,
+      "grad_norm": 0.8250978511317656,
+      "learning_rate": 8.345653031794292e-06,
+      "loss": 1.0832603454589844,
+      "step": 360
+    },
+    {
+      "epoch": 0.9243431465653688,
+      "grad_norm": 0.8250625494950978,
+      "learning_rate": 8.293447300593402e-06,
+      "loss": 1.0881545066833496,
+      "step": 365
+    },
+    {
+      "epoch": 0.9370053814498259,
+      "grad_norm": 0.9637417812174898,
+      "learning_rate": 8.240599505315656e-06,
+      "loss": 1.077590274810791,
+      "step": 370
+    },
+    {
+      "epoch": 0.949667616334283,
+      "grad_norm": 0.938188486575515,
+      "learning_rate": 8.18711994874345e-06,
+      "loss": 1.0923616409301757,
+      "step": 375
+    },
+    {
+      "epoch": 0.9623298512187402,
+      "grad_norm": 0.829053167214024,
+      "learning_rate": 8.133019056822303e-06,
+      "loss": 1.0790325164794923,
+      "step": 380
+    },
+    {
+      "epoch": 0.9749920861031972,
+      "grad_norm": 0.8296874845053457,
+      "learning_rate": 8.078307376628292e-06,
+      "loss": 1.0690267562866211,
+      "step": 385
+    },
+    {
+      "epoch": 0.9876543209876543,
+      "grad_norm": 0.8248755231512207,
+      "learning_rate": 8.022995574311876e-06,
+      "loss": 1.0922147750854492,
+      "step": 390
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.9123714875418006,
+      "learning_rate": 7.967094433018508e-06,
+      "loss": 1.0716293334960938,
+      "step": 395
+    },
+    {
+      "epoch": 1.0126622348844572,
+      "grad_norm": 0.8825626316822892,
+      "learning_rate": 7.910614850786448e-06,
+      "loss": 0.9421855926513671,
+      "step": 400
+    },
+    {
+      "epoch": 1.0253244697689141,
+      "grad_norm": 0.981129259243819,
+      "learning_rate": 7.85356783842216e-06,
+      "loss": 0.9680027008056641,
+      "step": 405
+    },
+    {
+      "epoch": 1.0379867046533713,
+      "grad_norm": 0.9490494582638624,
+      "learning_rate": 7.795964517353734e-06,
+      "loss": 0.9392026901245117,
+      "step": 410
+    },
+    {
+      "epoch": 1.0506489395378285,
+      "grad_norm": 1.0436527309713077,
+      "learning_rate": 7.737816117462752e-06,
+      "loss": 0.9481110572814941,
+      "step": 415
+    },
+    {
+      "epoch": 1.0633111744222856,
+      "grad_norm": 0.9193717140597131,
+      "learning_rate": 7.679133974894984e-06,
+      "loss": 0.9479268074035645,
+      "step": 420
+    },
+    {
+      "epoch": 1.0759734093067426,
+      "grad_norm": 0.9176846478769476,
+      "learning_rate": 7.619929529850397e-06,
+      "loss": 0.9510162353515625,
+      "step": 425
+    },
+    {
+      "epoch": 1.0886356441911997,
+      "grad_norm": 0.9263690784461404,
+      "learning_rate": 7.560214324352858e-06,
+      "loss": 0.9560428619384765,
+      "step": 430
+    },
+    {
+      "epoch": 1.101297879075657,
+      "grad_norm": 0.8985018721390384,
+      "learning_rate": 7.500000000000001e-06,
+      "loss": 0.9549171447753906,
+      "step": 435
+    },
+    {
+      "epoch": 1.1139601139601139,
+      "grad_norm": 0.8383045792822509,
+      "learning_rate": 7.4392982956936644e-06,
+      "loss": 0.9572299957275391,
+      "step": 440
+    },
+    {
+      "epoch": 1.126622348844571,
+      "grad_norm": 0.8693402459631241,
+      "learning_rate": 7.378121045351378e-06,
+      "loss": 0.9538370132446289,
+      "step": 445
+    },
+    {
+      "epoch": 1.1392845837290282,
+      "grad_norm": 0.8465948151936904,
+      "learning_rate": 7.31648017559931e-06,
+      "loss": 0.9445423126220703,
+      "step": 450
+    },
+    {
+      "epoch": 1.1519468186134854,
+      "grad_norm": 0.8993258971886791,
+      "learning_rate": 7.254387703447154e-06,
+      "loss": 0.9402847290039062,
+      "step": 455
+    },
+    {
+      "epoch": 1.1646090534979423,
+      "grad_norm": 0.8973654441260622,
+      "learning_rate": 7.191855733945388e-06,
+      "loss": 0.9458431243896485,
+      "step": 460
+    },
+    {
+      "epoch": 1.1772712883823995,
+      "grad_norm": 0.8975789539843146,
+      "learning_rate": 7.128896457825364e-06,
+      "loss": 0.9456979751586914,
+      "step": 465
+    },
+    {
+      "epoch": 1.1899335232668566,
+      "grad_norm": 0.9025883974896288,
+      "learning_rate": 7.06552214912271e-06,
+      "loss": 0.958702278137207,
+      "step": 470
+    },
+    {
+      "epoch": 1.2025957581513138,
+      "grad_norm": 0.8943619241590697,
+      "learning_rate": 7.0017451627844765e-06,
+      "loss": 0.9409778594970704,
+      "step": 475
+    },
+    {
+      "epoch": 1.2152579930357708,
+      "grad_norm": 0.8987697465779751,
+      "learning_rate": 6.9375779322605154e-06,
+      "loss": 0.952575397491455,
+      "step": 480
+    },
+    {
+      "epoch": 1.227920227920228,
+      "grad_norm": 0.8957262384243423,
+      "learning_rate": 6.873032967079562e-06,
+      "loss": 0.9412460327148438,
+      "step": 485
+    },
+    {
+      "epoch": 1.240582462804685,
+      "grad_norm": 0.9191287064439484,
+      "learning_rate": 6.808122850410461e-06,
+      "loss": 0.9442897796630859,
+      "step": 490
+    },
+    {
+      "epoch": 1.253244697689142,
+      "grad_norm": 0.9120111224616239,
+      "learning_rate": 6.7428602366090764e-06,
+      "loss": 0.9721967697143554,
+      "step": 495
+    },
+    {
+      "epoch": 1.2659069325735992,
+      "grad_norm": 0.9297557344562997,
+      "learning_rate": 6.677257848751276e-06,
+      "loss": 0.9427990913391113,
+      "step": 500
+    },
+    {
+      "epoch": 1.2785691674580564,
+      "grad_norm": 0.9256360350131605,
+      "learning_rate": 6.611328476152557e-06,
+      "loss": 0.9448193550109864,
+      "step": 505
+    },
+    {
+      "epoch": 1.2912314023425133,
+      "grad_norm": 0.9178166712574457,
+      "learning_rate": 6.545084971874738e-06,
+      "loss": 0.9285225868225098,
+      "step": 510
+    },
+    {
+      "epoch": 1.3038936372269705,
+      "grad_norm": 0.8824737418151191,
+      "learning_rate": 6.4785402502202345e-06,
+      "loss": 0.9465466499328613,
+      "step": 515
+    },
+    {
+      "epoch": 1.3165558721114277,
+      "grad_norm": 0.8714305178817582,
+      "learning_rate": 6.411707284214384e-06,
+      "loss": 0.9558137893676758,
+      "step": 520
+    },
+    {
+      "epoch": 1.3292181069958848,
+      "grad_norm": 1.6420471551581535,
+      "learning_rate": 6.344599103076329e-06,
+      "loss": 0.9441043853759765,
+      "step": 525
+    },
+    {
+      "epoch": 1.341880341880342,
+      "grad_norm": 0.8940534993249484,
+      "learning_rate": 6.277228789678953e-06,
+      "loss": 0.9406339645385742,
+      "step": 530
+    },
+    {
+      "epoch": 1.354542576764799,
+      "grad_norm": 0.8657105103377609,
+      "learning_rate": 6.209609477998339e-06,
+      "loss": 0.9400988578796386,
+      "step": 535
+    },
+    {
+      "epoch": 1.3672048116492561,
+      "grad_norm": 0.8795303497602281,
+      "learning_rate": 6.141754350553279e-06,
+      "loss": 0.9375904083251954,
+      "step": 540
+    },
+    {
+      "epoch": 1.3798670465337133,
+      "grad_norm": 0.8778881000839949,
+      "learning_rate": 6.073676635835317e-06,
+      "loss": 0.9534420013427735,
+      "step": 545
+    },
+    {
+      "epoch": 1.3925292814181702,
+      "grad_norm": 0.8609329406866304,
+      "learning_rate": 6.005389605729824e-06,
+      "loss": 0.9435734748840332,
+      "step": 550
+    },
+    {
+      "epoch": 1.4051915163026274,
+      "grad_norm": 0.901450340070586,
+      "learning_rate": 5.936906572928625e-06,
+      "loss": 0.9454706192016602,
+      "step": 555
+    },
+    {
+      "epoch": 1.4178537511870846,
+      "grad_norm": 0.9056724009579911,
+      "learning_rate": 5.8682408883346535e-06,
+      "loss": 0.9358626365661621,
+      "step": 560
+    },
+    {
+      "epoch": 1.4305159860715415,
+      "grad_norm": 0.8767791922734569,
+      "learning_rate": 5.799405938459175e-06,
+      "loss": 0.9384665489196777,
+      "step": 565
+    },
+    {
+      "epoch": 1.4431782209559987,
+      "grad_norm": 0.9226108292554362,
+      "learning_rate": 5.730415142812059e-06,
+      "loss": 0.9389400482177734,
+      "step": 570
+    },
+    {
+      "epoch": 1.4558404558404558,
+      "grad_norm": 0.8635227126945888,
+      "learning_rate": 5.661281951285613e-06,
+      "loss": 0.9539518356323242,
+      "step": 575
+    },
+    {
+      "epoch": 1.468502690724913,
+      "grad_norm": 0.8840260265705664,
+      "learning_rate": 5.592019841532507e-06,
+      "loss": 0.9480253219604492,
+      "step": 580
+    },
+    {
+      "epoch": 1.4811649256093702,
+      "grad_norm": 0.9151680057009149,
+      "learning_rate": 5.522642316338268e-06,
+      "loss": 0.9404661178588867,
+      "step": 585
+    },
+    {
+      "epoch": 1.4938271604938271,
+      "grad_norm": 0.9450262697016882,
+      "learning_rate": 5.453162900988902e-06,
+      "loss": 0.9321787834167481,
+      "step": 590
+    },
+    {
+      "epoch": 1.5064893953782843,
+      "grad_norm": 0.8402436559360018,
+      "learning_rate": 5.383595140634093e-06,
+      "loss": 0.9440553665161133,
+      "step": 595
+    },
+    {
+      "epoch": 1.5191516302627415,
+      "grad_norm": 0.8778976142471068,
+      "learning_rate": 5.3139525976465675e-06,
+      "loss": 0.9511254310607911,
+      "step": 600
+    },
+    {
+      "epoch": 1.5318138651471984,
+      "grad_norm": 0.8781843644707367,
+      "learning_rate": 5.244248848978067e-06,
+      "loss": 0.9387626647949219,
+      "step": 605
+    },
+    {
+      "epoch": 1.5444761000316556,
+      "grad_norm": 0.8642449781808372,
+      "learning_rate": 5.174497483512506e-06,
+      "loss": 0.956205177307129,
+      "step": 610
+    },
+    {
+      "epoch": 1.5571383349161128,
+      "grad_norm": 0.8846802147972775,
+      "learning_rate": 5.1047120994167855e-06,
+      "loss": 0.9363911628723145,
+      "step": 615
+    },
+    {
+      "epoch": 1.5698005698005697,
+      "grad_norm": 0.8739137758439613,
+      "learning_rate": 5.034906301489808e-06,
+      "loss": 0.9367790222167969,
+      "step": 620
+    },
+    {
+      "epoch": 1.5824628046850269,
+      "grad_norm": 0.8953494651595788,
+      "learning_rate": 4.965093698510192e-06,
+      "loss": 0.9425483703613281,
+      "step": 625
+    },
+    {
+      "epoch": 1.595125039569484,
+      "grad_norm": 0.8615421639128288,
+      "learning_rate": 4.895287900583216e-06,
+      "loss": 0.9341062545776367,
+      "step": 630
+    },
+    {
+      "epoch": 1.607787274453941,
+      "grad_norm": 0.8353360832306662,
+      "learning_rate": 4.825502516487497e-06,
+      "loss": 0.949849796295166,
+      "step": 635
+    },
+    {
+      "epoch": 1.6204495093383984,
+      "grad_norm": 0.8563998366304418,
+      "learning_rate": 4.755751151021934e-06,
+      "loss": 0.9409940719604493,
+      "step": 640
+    },
+    {
+      "epoch": 1.6331117442228553,
+      "grad_norm": 0.9360183967885729,
+      "learning_rate": 4.686047402353433e-06,
+      "loss": 0.939891242980957,
+      "step": 645
+    },
+    {
+      "epoch": 1.6457739791073125,
+      "grad_norm": 0.8806457976411894,
+      "learning_rate": 4.6164048593659076e-06,
+      "loss": 0.952726173400879,
+      "step": 650
+    },
+    {
+      "epoch": 1.6584362139917697,
+      "grad_norm": 0.8871650293826654,
+      "learning_rate": 4.546837099011101e-06,
+      "loss": 0.9440122604370117,
+      "step": 655
+    },
+    {
+      "epoch": 1.6710984488762266,
+      "grad_norm": 0.8543495337665787,
+      "learning_rate": 4.477357683661734e-06,
+      "loss": 0.9277559280395508,
+      "step": 660
+    },
+    {
+      "epoch": 1.6837606837606838,
+      "grad_norm": 0.8754310619944701,
+      "learning_rate": 4.4079801584674955e-06,
+      "loss": 0.9328133583068847,
+      "step": 665
+    },
+    {
+      "epoch": 1.696422918645141,
+      "grad_norm": 0.846881206379322,
+      "learning_rate": 4.3387180487143875e-06,
+      "loss": 0.9440486907958985,
+      "step": 670
+    },
+    {
+      "epoch": 1.709085153529598,
+      "grad_norm": 0.8123484252146217,
+      "learning_rate": 4.269584857187942e-06,
+      "loss": 0.9334369659423828,
+      "step": 675
+    },
+    {
+      "epoch": 1.721747388414055,
+      "grad_norm": 0.8860941606484654,
+      "learning_rate": 4.200594061540827e-06,
+      "loss": 0.9386373519897461,
+      "step": 680
+    },
+    {
+      "epoch": 1.7344096232985122,
+      "grad_norm": 0.8710977899292981,
+      "learning_rate": 4.131759111665349e-06,
+      "loss": 0.9379000663757324,
+      "step": 685
+    },
+    {
+      "epoch": 1.7470718581829692,
+      "grad_norm": 0.8989668390644706,
+      "learning_rate": 4.063093427071376e-06,
+      "loss": 0.9351366043090821,
+      "step": 690
+    },
+    {
+      "epoch": 1.7597340930674266,
+      "grad_norm": 0.8426262295188102,
+      "learning_rate": 3.994610394270178e-06,
+      "loss": 0.9458501815795899,
+      "step": 695
+    },
+    {
+      "epoch": 1.7723963279518835,
+      "grad_norm": 0.8490556601435445,
+      "learning_rate": 3.926323364164684e-06,
+      "loss": 0.9344646453857421,
+      "step": 700
+    },
+    {
+      "epoch": 1.7850585628363405,
+      "grad_norm": 0.857013306646358,
+      "learning_rate": 3.8582456494467214e-06,
+      "loss": 0.9324585914611816,
+      "step": 705
+    },
+    {
+      "epoch": 1.7977207977207978,
+      "grad_norm": 0.8442075171060656,
+      "learning_rate": 3.790390522001662e-06,
+      "loss": 0.9345897674560547,
+      "step": 710
+    },
+    {
+      "epoch": 1.8103830326052548,
+      "grad_norm": 0.8635838902214552,
+      "learning_rate": 3.7227712103210485e-06,
+      "loss": 0.9480118751525879,
+      "step": 715
+    },
+    {
+      "epoch": 1.823045267489712,
+      "grad_norm": 0.8701785787205291,
+      "learning_rate": 3.655400896923672e-06,
+      "loss": 0.9411863327026367,
+      "step": 720
+    },
+    {
+      "epoch": 1.8357075023741691,
+      "grad_norm": 0.9278897279843371,
+      "learning_rate": 3.5882927157856175e-06,
+      "loss": 0.9384016036987305,
+      "step": 725
+    },
+    {
+      "epoch": 1.848369737258626,
+      "grad_norm": 0.8675201640437896,
+      "learning_rate": 3.521459749779769e-06,
+      "loss": 0.9388191223144531,
+      "step": 730
+    },
+    {
+      "epoch": 1.8610319721430832,
+      "grad_norm": 0.9047480946293855,
+      "learning_rate": 3.4549150281252635e-06,
+      "loss": 0.943515396118164,
+      "step": 735
+    },
+    {
+      "epoch": 1.8736942070275404,
+      "grad_norm": 0.9100256206799584,
+      "learning_rate": 3.3886715238474454e-06,
+      "loss": 0.9317167282104493,
+      "step": 740
+    },
+    {
+      "epoch": 1.8863564419119974,
+      "grad_norm": 0.9121240599713055,
+      "learning_rate": 3.322742151248726e-06,
+      "loss": 0.9298182487487793,
+      "step": 745
+    },
+    {
+      "epoch": 1.8990186767964545,
+      "grad_norm": 0.8360632961222116,
+      "learning_rate": 3.2571397633909252e-06,
+      "loss": 0.9383123397827149,
+      "step": 750
+    },
+    {
+      "epoch": 1.9116809116809117,
+      "grad_norm": 0.8449980062948027,
+      "learning_rate": 3.1918771495895395e-06,
+      "loss": 0.9380681991577149,
+      "step": 755
+    },
+    {
+      "epoch": 1.9243431465653686,
+      "grad_norm": 0.8358057866853585,
+      "learning_rate": 3.12696703292044e-06,
+      "loss": 0.9311031341552735,
+      "step": 760
+    },
+    {
+      "epoch": 1.937005381449826,
+      "grad_norm": 0.8261214369483678,
+      "learning_rate": 3.0624220677394854e-06,
+      "loss": 0.9335260391235352,
+      "step": 765
+    },
+    {
+      "epoch": 1.949667616334283,
+      "grad_norm": 0.8746978630306859,
+      "learning_rate": 2.9982548372155264e-06,
+      "loss": 0.9282594680786133,
+      "step": 770
+    },
+    {
+      "epoch": 1.9623298512187402,
+      "grad_norm": 0.8914685495920053,
+      "learning_rate": 2.934477850877292e-06,
+      "loss": 0.9267834663391114,
+      "step": 775
+    },
+    {
+      "epoch": 1.9749920861031973,
+      "grad_norm": 0.8730909900000534,
+      "learning_rate": 2.871103542174637e-06,
+      "loss": 0.9400104522705078,
+      "step": 780
+    },
+    {
+      "epoch": 1.9876543209876543,
+      "grad_norm": 0.9195388866817068,
+      "learning_rate": 2.8081442660546126e-06,
+      "loss": 0.9355339050292969,
+      "step": 785
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 0.8941990040688051,
+      "learning_rate": 2.7456122965528475e-06,
+      "loss": 0.9464699745178222,
+      "step": 790
+    },
+    {
+      "epoch": 2.012662234884457,
+      "grad_norm": 0.9551299167570609,
+      "learning_rate": 2.683519824400693e-06,
+      "loss": 0.8369241714477539,
+      "step": 795
+    },
+    {
+      "epoch": 2.0253244697689143,
+      "grad_norm": 0.9503417747763285,
+      "learning_rate": 2.6218789546486235e-06,
+      "loss": 0.8305461883544922,
+      "step": 800
+    },
+    {
+      "epoch": 2.0379867046533713,
+      "grad_norm": 0.9428708677587196,
+      "learning_rate": 2.560701704306336e-06,
+      "loss": 0.8380617141723633,
+      "step": 805
+    },
+    {
+      "epoch": 2.0506489395378282,
+      "grad_norm": 0.9141118129164282,
+      "learning_rate": 2.5000000000000015e-06,
+      "loss": 0.8350756645202637,
+      "step": 810
+    },
+    {
+      "epoch": 2.0633111744222856,
+      "grad_norm": 0.8925703133521584,
+      "learning_rate": 2.4397856756471435e-06,
+      "loss": 0.8253829956054688,
+      "step": 815
+    },
+    {
+      "epoch": 2.0759734093067426,
+      "grad_norm": 0.9010434641718755,
+      "learning_rate": 2.380070470149605e-06,
+      "loss": 0.8296566009521484,
+      "step": 820
+    },
+    {
+      "epoch": 2.0886356441912,
+      "grad_norm": 0.9108639104627096,
+      "learning_rate": 2.320866025105016e-06,
+      "loss": 0.8311027526855469,
+      "step": 825
+    },
+    {
+      "epoch": 2.101297879075657,
+      "grad_norm": 0.8776111588691332,
+      "learning_rate": 2.2621838825372496e-06,
+      "loss": 0.8341006278991699,
+      "step": 830
+    },
+    {
+      "epoch": 2.113960113960114,
+      "grad_norm": 0.9815553215946904,
+      "learning_rate": 2.204035482646267e-06,
+      "loss": 0.8500799179077149,
+      "step": 835
+    },
+    {
+      "epoch": 2.1266223488445712,
+      "grad_norm": 0.9142581836968815,
+      "learning_rate": 2.146432161577842e-06,
+      "loss": 0.8405605316162109,
+      "step": 840
+    },
+    {
+      "epoch": 2.139284583729028,
+      "grad_norm": 1.145797436281295,
+      "learning_rate": 2.0893851492135536e-06,
+      "loss": 0.8333783149719238,
+      "step": 845
+    },
+    {
+      "epoch": 2.151946818613485,
+      "grad_norm": 1.0132915822668673,
+      "learning_rate": 2.0329055669814936e-06,
+      "loss": 0.8394683837890625,
+      "step": 850
+    },
+    {
+      "epoch": 2.1646090534979425,
+      "grad_norm": 1.054834919380244,
+      "learning_rate": 1.977004425688126e-06,
+      "loss": 0.8199810028076172,
+      "step": 855
+    },
+    {
+      "epoch": 2.1772712883823995,
+      "grad_norm": 0.8659196804614238,
+      "learning_rate": 1.9216926233717087e-06,
+      "loss": 0.8200090408325196,
+      "step": 860
+    },
+    {
+      "epoch": 2.1899335232668564,
+      "grad_norm": 0.8936160842705467,
+      "learning_rate": 1.8669809431776991e-06,
+      "loss": 0.819823932647705,
+      "step": 865
+    },
+    {
+      "epoch": 2.202595758151314,
+      "grad_norm": 0.8936829197489651,
+      "learning_rate": 1.8128800512565514e-06,
+      "loss": 0.8329672813415527,
+      "step": 870
+    },
+    {
+      "epoch": 2.2152579930357708,
+      "grad_norm": 0.865124596235692,
+      "learning_rate": 1.7594004946843458e-06,
+      "loss": 0.830903434753418,
+      "step": 875
+    },
+    {
+      "epoch": 2.2279202279202277,
+      "grad_norm": 1.0004908923945968,
+      "learning_rate": 1.7065526994065973e-06,
+      "loss": 0.8222661972045898,
+      "step": 880
+    },
+    {
+      "epoch": 2.240582462804685,
+      "grad_norm": 0.9974019476784539,
+      "learning_rate": 1.6543469682057105e-06,
+      "loss": 0.8375696182250977,
+      "step": 885
+    },
+    {
+      "epoch": 2.253244697689142,
+      "grad_norm": 0.9489943802555122,
+      "learning_rate": 1.6027934786924187e-06,
+      "loss": 0.8297539710998535,
+      "step": 890
+    },
+    {
+      "epoch": 2.2659069325735994,
+      "grad_norm": 0.8526558052313017,
+      "learning_rate": 1.551902281321651e-06,
+      "loss": 0.8450464248657227,
+      "step": 895
+    },
+    {
+      "epoch": 2.2785691674580564,
+      "grad_norm": 0.9095006101158244,
+      "learning_rate": 1.5016832974331725e-06,
+      "loss": 0.8434087753295898,
+      "step": 900
+    },
+    {
+      "epoch": 2.2912314023425133,
+      "grad_norm": 0.8941646461803728,
+      "learning_rate": 1.4521463173173966e-06,
+      "loss": 0.8199748992919922,
+      "step": 905
+    },
+    {
+      "epoch": 2.3038936372269707,
+      "grad_norm": 1.0151629908802393,
+      "learning_rate": 1.4033009983067454e-06,
+      "loss": 0.8257926940917969,
+      "step": 910
+    },
+    {
+      "epoch": 2.3165558721114277,
+      "grad_norm": 0.8688789281578927,
+      "learning_rate": 1.3551568628929434e-06,
+      "loss": 0.8222599029541016,
+      "step": 915
+    },
+    {
+      "epoch": 2.3292181069958846,
+      "grad_norm": 0.9256160248355862,
+      "learning_rate": 1.3077232968705805e-06,
+      "loss": 0.8179254531860352,
+      "step": 920
+    },
+    {
+      "epoch": 2.341880341880342,
+      "grad_norm": 0.8992368646832662,
+      "learning_rate": 1.2610095475073415e-06,
+      "loss": 0.8351934432983399,
+      "step": 925
+    },
+    {
+      "epoch": 2.354542576764799,
+      "grad_norm": 0.8950940935118609,
+      "learning_rate": 1.2150247217412186e-06,
+      "loss": 0.8317380905151367,
+      "step": 930
+    },
+    {
+      "epoch": 2.3672048116492563,
+      "grad_norm": 0.8832980241323902,
+      "learning_rate": 1.1697777844051105e-06,
+      "loss": 0.8315254211425781,
+      "step": 935
+    },
+    {
+      "epoch": 2.3798670465337133,
+      "grad_norm": 0.9169712891318174,
+      "learning_rate": 1.1252775564791023e-06,
+      "loss": 0.8270421981811523,
+      "step": 940
+    },
+    {
+      "epoch": 2.3925292814181702,
+      "grad_norm": 0.8820889479067028,
+      "learning_rate": 1.0815327133708015e-06,
+      "loss": 0.8373619079589844,
+      "step": 945
+    },
+    {
+      "epoch": 2.4051915163026276,
+      "grad_norm": 0.8633797543130399,
+      "learning_rate": 1.0385517832240472e-06,
+      "loss": 0.822084617614746,
+      "step": 950
+    },
+    {
+      "epoch": 2.4178537511870846,
+      "grad_norm": 0.883673978191503,
+      "learning_rate": 9.963431452563331e-07,
+      "loss": 0.8369743347167968,
+      "step": 955
+    },
+    {
+      "epoch": 2.4305159860715415,
+      "grad_norm": 0.8729883350584554,
+      "learning_rate": 9.549150281252633e-07,
+      "loss": 0.8232148170471192,
+      "step": 960
+    },
+    {
+      "epoch": 2.443178220955999,
+      "grad_norm": 0.9022268286398805,
+      "learning_rate": 9.142755083243577e-07,
+      "loss": 0.8386312484741211,
+      "step": 965
+    },
+    {
+      "epoch": 2.455840455840456,
+      "grad_norm": 0.886835830092829,
+      "learning_rate": 8.744325086085248e-07,
+      "loss": 0.8283025741577148,
+      "step": 970
+    },
+    {
+      "epoch": 2.468502690724913,
+      "grad_norm": 0.8893818705075751,
+      "learning_rate": 8.353937964495029e-07,
+      "loss": 0.8303911209106445,
+      "step": 975
+    },
+    {
+      "epoch": 2.48116492560937,
+      "grad_norm": 0.922324297214932,
+      "learning_rate": 7.971669825215789e-07,
+      "loss": 0.836126708984375,
+      "step": 980
+    },
+    {
+      "epoch": 2.493827160493827,
+      "grad_norm": 0.904255523759268,
+      "learning_rate": 7.597595192178702e-07,
+      "loss": 0.8196451187133789,
+      "step": 985
+    },
+    {
+      "epoch": 2.506489395378284,
+      "grad_norm": 0.8636934543130641,
+      "learning_rate": 7.23178699197467e-07,
+      "loss": 0.8335494995117188,
+      "step": 990
+    },
+    {
+      "epoch": 2.5191516302627415,
+      "grad_norm": 0.897039019918768,
+      "learning_rate": 6.874316539637127e-07,
+      "loss": 0.8088079452514648,
+      "step": 995
+    },
+    {
+      "epoch": 2.5318138651471984,
+      "grad_norm": 0.8864019874531589,
+      "learning_rate": 6.52525352473905e-07,
+      "loss": 0.8233877182006836,
+      "step": 1000
+    },
+    {
+      "epoch": 2.5444761000316554,
+      "grad_norm": 0.9088661368290617,
+      "learning_rate": 6.184665997806832e-07,
+      "loss": 0.8182021141052246,
+      "step": 1005
+    },
+    {
+      "epoch": 2.5571383349161128,
+      "grad_norm": 0.9072624944819723,
+      "learning_rate": 5.852620357053651e-07,
+      "loss": 0.835714054107666,
+      "step": 1010
+    },
+    {
+      "epoch": 2.5698005698005697,
+      "grad_norm": 0.8731742132357438,
+      "learning_rate": 5.529181335435124e-07,
+      "loss": 0.8283638000488281,
+      "step": 1015
+    },
+    {
+      "epoch": 2.5824628046850266,
+      "grad_norm": 0.8761704900525855,
+      "learning_rate": 5.214411988029355e-07,
+      "loss": 0.828251838684082,
+      "step": 1020
+    },
+    {
+      "epoch": 2.595125039569484,
+      "grad_norm": 0.8629998756845685,
+      "learning_rate": 4.908373679744316e-07,
+      "loss": 0.8239392280578614,
+      "step": 1025
+    },
+    {
+      "epoch": 2.607787274453941,
+      "grad_norm": 0.9025713028894049,
+      "learning_rate": 4.6111260733545714e-07,
+      "loss": 0.8368805885314942,
+      "step": 1030
+    },
+    {
+      "epoch": 2.6204495093383984,
+      "grad_norm": 0.8791508721949534,
+      "learning_rate": 4.322727117869951e-07,
+      "loss": 0.8214786529541016,
+      "step": 1035
+    },
+    {
+      "epoch": 2.6331117442228553,
+      "grad_norm": 0.8747271828916487,
+      "learning_rate": 4.043233037238281e-07,
+      "loss": 0.8331809997558594,
+      "step": 1040
+    },
+    {
+      "epoch": 2.6457739791073127,
+      "grad_norm": 0.9189023842289675,
+      "learning_rate": 3.772698319384349e-07,
+      "loss": 0.8299878120422364,
+      "step": 1045
+    },
+    {
+      "epoch": 2.6584362139917697,
+      "grad_norm": 0.9012554611673713,
+      "learning_rate": 3.511175705587433e-07,
+      "loss": 0.8398582458496093,
+      "step": 1050
+    },
+    {
+      "epoch": 2.6710984488762266,
+      "grad_norm": 0.8857980961838654,
+      "learning_rate": 3.258716180199278e-07,
+      "loss": 0.818387794494629,
+      "step": 1055
+    },
+    {
+      "epoch": 2.683760683760684,
+      "grad_norm": 0.8627721513188427,
+      "learning_rate": 3.015368960704584e-07,
+      "loss": 0.8408231735229492,
+      "step": 1060
+    },
+    {
+      "epoch": 2.696422918645141,
+      "grad_norm": 1.0338964144567573,
+      "learning_rate": 2.7811814881259503e-07,
+      "loss": 0.8292581558227539,
+      "step": 1065
+    },
+    {
+      "epoch": 2.709085153529598,
+      "grad_norm": 0.8858535803633547,
+      "learning_rate": 2.556199417775174e-07,
+      "loss": 0.8229169845581055,
+      "step": 1070
+    },
+    {
+      "epoch": 2.7217473884140553,
+      "grad_norm": 0.8487724578022646,
+      "learning_rate": 2.3404666103526542e-07,
+      "loss": 0.8243260383605957,
+      "step": 1075
+    },
+    {
+      "epoch": 2.7344096232985122,
+      "grad_norm": 0.9142856277892702,
+      "learning_rate": 2.134025123396638e-07,
+      "loss": 0.8411771774291992,
+      "step": 1080
+    },
+    {
+      "epoch": 2.747071858182969,
+      "grad_norm": 0.9044836277079623,
+      "learning_rate": 1.9369152030840553e-07,
+      "loss": 0.8182785034179687,
+      "step": 1085
+    },
+    {
+      "epoch": 2.7597340930674266,
+      "grad_norm": 0.8753811947010942,
+      "learning_rate": 1.7491752763844294e-07,
+      "loss": 0.8330059051513672,
+      "step": 1090
+    },
+    {
+      "epoch": 2.7723963279518835,
+      "grad_norm": 0.9127704993164228,
+      "learning_rate": 1.5708419435684463e-07,
+      "loss": 0.8270849227905274,
+      "step": 1095
+    },
+    {
+      "epoch": 2.7850585628363405,
+      "grad_norm": 0.8885180442416001,
+      "learning_rate": 1.4019499710726913e-07,
+      "loss": 0.8345333099365234,
+      "step": 1100
+    },
+    {
+      "epoch": 2.797720797720798,
+      "grad_norm": 0.9121582122101339,
+      "learning_rate": 1.2425322847218368e-07,
+      "loss": 0.8229399681091308,
+      "step": 1105
+    },
+    {
+      "epoch": 2.810383032605255,
+      "grad_norm": 0.9127878209504691,
+      "learning_rate": 1.0926199633097156e-07,
+      "loss": 0.82467041015625,
+      "step": 1110
+    },
+    {
+      "epoch": 2.8230452674897117,
+      "grad_norm": 1.744776544604879,
+      "learning_rate": 9.522422325404234e-08,
+      "loss": 0.8274450302124023,
+      "step": 1115
+    },
+    {
+      "epoch": 2.835707502374169,
+      "grad_norm": 0.8982447409078338,
+      "learning_rate": 8.214264593307097e-08,
+      "loss": 0.8290293693542481,
+      "step": 1120
+    },
+    {
+      "epoch": 2.848369737258626,
+      "grad_norm": 0.8866891213730514,
+      "learning_rate": 7.001981464747565e-08,
+      "loss": 0.8212656021118164,
+      "step": 1125
+    },
+    {
+      "epoch": 2.861031972143083,
+      "grad_norm": 0.9114969683085143,
+      "learning_rate": 5.8858092767236084e-08,
+      "loss": 0.8231026649475097,
+      "step": 1130
+    },
+    {
+      "epoch": 2.8736942070275404,
+      "grad_norm": 0.8832450189167574,
+      "learning_rate": 4.865965629214819e-08,
+      "loss": 0.830931282043457,
+      "step": 1135
+    },
+    {
+      "epoch": 2.8863564419119974,
+      "grad_norm": 0.8487481576996411,
+      "learning_rate": 3.9426493427611177e-08,
+      "loss": 0.8255987167358398,
+      "step": 1140
+    },
+    {
+      "epoch": 2.8990186767964543,
+      "grad_norm": 0.879731012515658,
+      "learning_rate": 3.1160404197018155e-08,
+      "loss": 0.8359064102172852,
+      "step": 1145
+    },
+    {
+      "epoch": 2.9116809116809117,
+      "grad_norm": 0.9058610366933287,
+      "learning_rate": 2.386300009084408e-08,
+      "loss": 0.8246042251586914,
+      "step": 1150
+    },
+    {
+      "epoch": 2.9243431465653686,
+      "grad_norm": 0.898984738606933,
+      "learning_rate": 1.753570375247815e-08,
+      "loss": 0.8313743591308593,
+      "step": 1155
+    },
+    {
+      "epoch": 2.937005381449826,
+      "grad_norm": 0.8885810042812119,
+      "learning_rate": 1.2179748700879013e-08,
+      "loss": 0.829072380065918,
+      "step": 1160
+    },
+    {
+      "epoch": 2.949667616334283,
+      "grad_norm": 0.903649383117039,
+      "learning_rate": 7.796179090094891e-09,
+      "loss": 0.8449357986450196,
+      "step": 1165
+    },
+    {
+      "epoch": 2.9623298512187404,
+      "grad_norm": 0.9753427783203841,
+      "learning_rate": 4.385849505708084e-09,
+      "loss": 0.8176769256591797,
+      "step": 1170
+    },
+    {
+      "epoch": 2.9749920861031973,
+      "grad_norm": 0.8748130538451198,
+      "learning_rate": 1.9494247982282386e-09,
+      "loss": 0.8217670440673828,
+      "step": 1175
+    },
+    {
+      "epoch": 2.9876543209876543,
+      "grad_norm": 0.9286601621176515,
+      "learning_rate": 4.87379953478806e-10,
+      "loss": 0.8410984992980957,
+      "step": 1180
+    },
+    {
+      "epoch": 3.0,
+      "grad_norm": 0.9389103063767028,
+      "learning_rate": 0.0,
+      "loss": 0.8439925193786622,
+      "step": 1185
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 1185,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.9001299950934426e+17,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/training_args.bin b/training_args.bin
new file mode 100644
index 0000000..bd1651f
--- /dev/null
+++ b/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d02cf2a49c614bff625160d51a72e2126e582e1938773da64ec29b678ba890ae
+size 9553
diff --git a/vocab.json b/vocab.json
new file mode 100644
index 0000000..6c49fc6
--- /dev/null
+++ b/vocab.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ca10d7e9fb3ed18575dd1e277a2579c16d108e32f27439684afa0e10b1440910
+size 2776833