# TIES merge configuration for four specialized Qwen fine-tunes
# (math, general knowledge, multilingual, safety).
#
# Goals:
#   - Preserve strong domain expertise from each model
#   - Reduce destructive interference between skills
#   - Keep general reasoning stability from the base model
#
# Recommended for:
#   - Qwen2 / Qwen2.5 instruction-tuned variants
#   - Same architecture + same parameter count
#   - Same tokenizer + same base checkpoint lineage
#
# NOTE(review): the recommendation above names Qwen2 / Qwen2.5, but the
# `base_model` configured in this file is Qwen/Qwen3-1.7B. Confirm which
# model family the four fine-tunes actually derive from.

# Models to merge. Each entry names a fine-tune and gives its per-model
# TIES parameters; list-valued parameters pick a value by tensor-name
# filter, and an entry without `filter` acts as the fallback.
models:
# ---------------------------
# Math specialist
# ---------------------------
- model: cs-552-2026-claude-bots/math_model
  parameters:
    # High density because math capabilities are usually sparse
    # and easily lost during merging.
    # NOTE(review): this list has no catch-all `- value:` entry, so
    # tensors matching neither filter presumably fall back to mergekit's
    # default density - confirm that is intended.
    density:
    - filter: self_attn
      value: 0.72
    - filter: mlp
      value: 0.82

    # Strong contribution in reasoning-heavy blocks (attention and MLP);
    # the final unfiltered entry covers every other tensor.
    weight:
    - filter: self_attn
      value: 1.25
    - filter: mlp
      value: 1.15
    - value: 1.10

# ---------------------------
# Knowledge / factual model
# ---------------------------
- model: cs-552-2026-claude-bots/general_knowledge_model
  parameters:
    # Moderate density, applied uniformly to all tensors:
    # factual tuning tends to be more distributed.
    density: 0.58

    # Slightly lower than the math model's weights to avoid overwriting
    # reasoning; the final unfiltered entry covers every other tensor.
    weight:
    - filter: self_attn
      value: 1.00
    - filter: mlp
      value: 0.95
    - value: 0.95

# ---------------------------
# Multilingual specialist
# ---------------------------
- model: cs-552-2026-claude-bots/multilingual_model
  parameters:
    # Language capabilities are often spread broadly,
    # so use reasonably high density (highest on the token embeddings).
    # NOTE(review): this list has no catch-all `- value:` entry, so
    # tensors matching none of the filters presumably fall back to
    # mergekit's default density - confirm that is intended.
    density:
    - filter: embed_tokens
      value: 0.90
    - filter: self_attn
      value: 0.68
    - filter: mlp
      value: 0.62

    # Stronger influence on embeddings and attention; the final
    # unfiltered entry covers every other tensor (including MLP).
    weight:
    - filter: embed_tokens
      value: 1.30
    - filter: self_attn
      value: 1.10
    - value: 1.00

# ---------------------------
# Safety / alignment model
# ---------------------------
- model: cs-552-2026-claude-bots/safety_model
  parameters:
    # Lower density (applied uniformly to all tensors) prevents
    # excessive refusals while still preserving alignment behavior.
    density: 0.34

    # Important but intentionally constrained: every weight is below
    # 1.0; the final unfiltered entry covers every other tensor.
    weight:
    - filter: self_attn
      value: 0.82
    - filter: mlp
      value: 0.72
    - value: 0.75

# Merge algorithm: TIES (uses the per-model `density` / `weight` values).
merge_method: ties

# IMPORTANT:
# Use the ORIGINAL shared pretrained base model
# from which all four fine-tunes were derived.
# NOTE(review): `Qwen/Qwen3-1.7B` is the post-trained release; the
# pretrained checkpoint is published separately as
# `Qwen/Qwen3-1.7B-Base`. Confirm which one the fine-tunes started from.
base_model: Qwen/Qwen3-1.7B

# Global merge parameters (apply to the whole merge, not to one model).
parameters:
  # Critical for TIES stability: per the mergekit docs, normalizes the
  # weights of the models contributing to each tensor, so the per-model
  # `weight` values act as relative proportions.
  normalize: true

  # Per the mergekit docs, stores the intermediate masks as int8 to
  # reduce memory usage.
  int8_mask: true

  # Trim very small parameter deltas.
  # Good default for 4-way merges.
  # NOTE(review): `prune_threshold` is not among the TIES parameters
  # listed in mergekit's documentation (density, weight, normalize,
  # int8_mask); it may be silently ignored. Confirm it is consumed by
  # the mergekit version in use, or remove it.
  prune_threshold: 0.015

# Data type for the merge.
dtype: bfloat16

# Optional (uncomment to enable):
# tokenizer_source: base
# chat_template: auto