---
# TIES merge configuration for 4 specialized Qwen fine-tunes
#
# Goal:
#   - Preserve strong domain expertise from each model
#   - Reduce destructive interference between skills
#   - Keep general reasoning stability from the base model
#
# Intended for:
#   - Qwen-family instruction-tuned variants
#   - Same architecture + same parameter count
#   - Same tokenizer + same base checkpoint lineage
#
# NOTE(review): this header previously named Qwen2 / Qwen2.5, while
# `base_model` below is Qwen/Qwen3-1.7B. Confirm which lineage the four
# fine-tunes actually derive from and keep the two in agreement.
#
# How to read the per-model parameters:
#   - `density` / `weight` may be a single number or a list of
#     `{filter, value}` entries; an entry without `filter` is the
#     catch-all for tensors that match no earlier filter.
#   - Because `normalize: true` is set globally, weights act as relative
#     proportions between models rather than absolute scale factors.

models:
  # ---------------------------
  # Math specialist
  # ---------------------------
  - model: cs-552-2026-claude-bots/math_model
    parameters:
      # High density because math capabilities are usually sparse
      # and easily lost during merging.
      # NOTE(review): no catch-all entry here; tensors matching neither
      # filter presumably fall back to the merge tool's default density.
      # Confirm that this is intended.
      density:
        - filter: self_attn
          value: 0.72
        - filter: mlp
          value: 0.82
      # Strong contribution in reasoning-heavy blocks
      weight:
        - filter: self_attn
          value: 1.25
        - filter: mlp
          value: 1.15
        - value: 1.10

  # ---------------------------
  # Knowledge / factual model
  # ---------------------------
  - model: cs-552-2026-claude-bots/general_knowledge_model
    parameters:
      # Moderate density:
      # factual tuning tends to be more distributed
      density: 0.58
      # Slightly lower than math to avoid overwriting reasoning
      weight:
        - filter: self_attn
          value: 1.00
        - filter: mlp
          value: 0.95
        - value: 0.95

  # ---------------------------
  # Multilingual specialist
  # ---------------------------
  - model: cs-552-2026-claude-bots/multilingual_model
    parameters:
      # Language capabilities are often spread broadly,
      # so use reasonably high density.
      # NOTE(review): no catch-all entry here either; see the note on the
      # math specialist's density list.
      density:
        - filter: embed_tokens
          value: 0.90
        - filter: self_attn
          value: 0.68
        - filter: mlp
          value: 0.62
      # Stronger influence on embeddings and attention
      weight:
        - filter: embed_tokens
          value: 1.30
        - filter: self_attn
          value: 1.10
        - value: 1.00

  # ---------------------------
  # Safety / alignment model
  # ---------------------------
  - model: cs-552-2026-claude-bots/safety_model
    parameters:
      # Lower density prevents excessive refusals
      # while still preserving alignment behavior
      density: 0.34
      # Important but intentionally constrained
      weight:
        - filter: self_attn
          value: 0.82
        - filter: mlp
          value: 0.72
        - value: 0.75

merge_method: ties

# IMPORTANT:
# Use the ORIGINAL shared pretrained base model
# from which all four fine-tunes were derived.
base_model: Qwen/Qwen3-1.7B

parameters:
  # Critical for TIES stability: rescales the per-model weights so the
  # merged delta is not inflated when several models contribute.
  normalize: true

  # Store intermediate masks as int8 to reduce memory usage.
  int8_mask: true

  # NOTE(review): `prune_threshold` is not among the parameters documented
  # for mergekit's `ties` method; trimming of small deltas is controlled by
  # the per-model `density` values above. Left disabled so the config does
  # not imply a pruning step that is not applied. Confirm before
  # re-enabling.
  # prune_threshold: 0.015

dtype: bfloat16

# Optional:
# tokenizer_source: base
# chat_template: auto