初始化项目,由ModelHub XC社区提供模型
Model: richardyoung/DeepSeek-R1-Distill-Qwen-7B-abliterated-obliteratus Source: Original Platform
This commit is contained in:
56
abliteration_metadata.json
Normal file
56
abliteration_metadata.json
Normal file
@@ -0,0 +1,56 @@
|
||||
{
|
||||
"source_model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
|
||||
"technique": "refusal_direction_ablation",
|
||||
"method": "advanced",
|
||||
"method_config": {
|
||||
"n_directions": 4,
|
||||
"direction_method": "svd",
|
||||
"norm_preserve": true,
|
||||
"regularization": 0.3,
|
||||
"refinement_passes": 2,
|
||||
"project_biases": true,
|
||||
"use_chat_template": true,
|
||||
"use_whitened_svd": false,
|
||||
"true_iterative_refinement": false,
|
||||
"winsorize_activations": false,
|
||||
"float_layer_interpolation": false,
|
||||
"cot_aware": false,
|
||||
"use_kl_optimization": false,
|
||||
"use_lora_ablation": false,
|
||||
"spectral_cascade": false,
|
||||
"spectral_bands": 3,
|
||||
"spectral_threshold": 0.05
|
||||
},
|
||||
"references": [
|
||||
"Arditi et al., Refusal in Language Models Is Mediated by a Single Direction (NeurIPS 2024)",
|
||||
"Gabliteration: SVD-based multi-direction extraction (arXiv:2512.18901)",
|
||||
"Norm-Preserving Biprojected Abliteration (grimjim, 2025)",
|
||||
"Young, Comparative Analysis of LLM Abliteration Methods (arXiv:2512.13655)",
|
||||
"Joad et al., More to Refusal than a Single Direction (2026)",
|
||||
"Heretic (p-e-w, 2025): Bayesian optimization, LoRA-mediated ablation, winsorization",
|
||||
"OBLITERATUS: Whitened SVD, EGA, CoT-aware, KL co-optimization, float interpolation (novel)"
|
||||
],
|
||||
"strong_layers": [
|
||||
27,
|
||||
26,
|
||||
25,
|
||||
24,
|
||||
23,
|
||||
22,
|
||||
21,
|
||||
20
|
||||
],
|
||||
"n_harmful_prompts": 512,
|
||||
"n_harmless_prompts": 512,
|
||||
"quality_metrics": {
|
||||
"perplexity": 47.104726995215096,
|
||||
"coherence": 1.0,
|
||||
"refusal_rate": 0.03333333333333333,
|
||||
"kl_divergence": 0.4912732243537903,
|
||||
"spectral_certification": "RED"
|
||||
},
|
||||
"kl_contributions": {},
|
||||
"cot_preserved_layers": [],
|
||||
"float_layer_weights": {},
|
||||
"lora_adapters_saved": false
|
||||
}
|
||||
Reference in New Issue
Block a user