初始化项目,由ModelHub XC社区提供模型
Model: cs-552-2026-claude-bots/group_model Source: Original Platform
This commit is contained in:
113
mergekit_config.yml
Normal file
113
mergekit_config.yml
Normal file
@@ -0,0 +1,113 @@
|
||||
# TIES merge configuration for 4 specialized Qwen fine-tunes
#
# Goal:
#   - Preserve strong domain expertise from each model
#   - Reduce destructive interference between skills
#   - Keep general reasoning stability from the base model
#
# Intended for fine-tunes that share:
#   - Same architecture + same parameter count
#   - Same tokenizer + same base checkpoint lineage
#     (the base configured below is Qwen/Qwen3-1.7B)
#
# Reading the per-model parameters:
#   - `density` and `weight` are given either as a single number or as a
#     list of {filter, value} entries.
#   - Per the mergekit docs, a `filter` entry applies to tensors whose name
#     contains that string, and an entry with only `value` is the fallback.

models:
  # ---------------------------
  # Math specialist
  # ---------------------------
  - model: cs-552-2026-claude-bots/math_model
    parameters:
      # High density because math capabilities are usually sparse
      # and easily lost during merging
      # NOTE(review): unlike the weight list below, this list has no
      # fallback entry, so tensors matching neither filter (embeddings,
      # norms, ...) use mergekit's built-in default density - confirm
      # that is intended.
      density:
        - filter: self_attn
          value: 0.72
        - filter: mlp
          value: 0.82

      # Strong contribution in reasoning-heavy blocks
      weight:
        - filter: self_attn
          value: 1.25
        - filter: mlp
          value: 1.15
        - value: 1.10

  # ---------------------------
  # Knowledge / factual model
  # ---------------------------
  - model: cs-552-2026-claude-bots/general_knowledge_model
    parameters:
      # Moderate density:
      # factual tuning tends to be more distributed
      density: 0.58

      # Slightly lower than math to avoid overwriting reasoning
      weight:
        - filter: self_attn
          value: 1.00
        - filter: mlp
          value: 0.95
        - value: 0.95

  # ---------------------------
  # Multilingual specialist
  # ---------------------------
  - model: cs-552-2026-claude-bots/multilingual_model
    parameters:
      # Language capabilities are often spread broadly,
      # so use reasonably high density
      # NOTE(review): no fallback entry here either; tensors outside
      # embed_tokens / self_attn / mlp use mergekit's default density.
      density:
        - filter: embed_tokens
          value: 0.90
        - filter: self_attn
          value: 0.68
        - filter: mlp
          value: 0.62

      # Stronger influence on embeddings and attention
      weight:
        - filter: embed_tokens
          value: 1.30
        - filter: self_attn
          value: 1.10
        - value: 1.00

  # ---------------------------
  # Safety / alignment model
  # ---------------------------
  - model: cs-552-2026-claude-bots/safety_model
    parameters:
      # Lower density prevents excessive refusals
      # while still preserving alignment behavior
      density: 0.34

      # Important but intentionally constrained
      weight:
        - filter: self_attn
          value: 0.82
        - filter: mlp
          value: 0.72
        - value: 0.75

merge_method: ties

# IMPORTANT:
# Use the ORIGINAL shared pretrained base model
# from which all four fine-tunes were derived.
# NOTE(review): this ID has no "-Base" suffix. Confirm the fine-tunes were
# trained from this exact checkpoint rather than from a separately published
# pretrained-only variant; TIES task vectors are computed against this model.
base_model: Qwen/Qwen3-1.7B

parameters:
  # The per-model weights above are not scaled to sum to 1,
  # so let mergekit normalize them (critical for TIES stability).
  normalize: true

  # Store the intermediate masks as int8 to reduce memory usage
  int8_mask: true

  # NOTE(review): `prune_threshold: 0.015` was set here ("trim very small
  # parameter deltas"), but it is not among the parameters mergekit documents
  # for `ties`; sparsification is driven by the per-model `density` values.
  # Disabled pending confirmation - restore it only if the mergekit version
  # in use actually reads this key.
  # prune_threshold: 0.015

dtype: bfloat16

# Optional:
# tokenizer_source: base
# chat_template: auto
|
||||
Reference in New Issue
Block a user