初始化项目，由ModelHub XC社区提供模型

Model: HaadesX/iconoclast-llama3.1-8b Source: Original Platform
2026-06-18 11:53:18 +08:00
commit 36b4430fad
19 changed files with 3442 additions and 0 deletions
--- a/iconoclast_config.toml
+++ b/iconoclast_config.toml
@@ -0,0 +1,68 @@
+# ICONOCLAST Configuration for Llama-3.1-8B-Instruct Model
+# This configuration produced the published model via trial #36 in the Optuna study
+
+model = "meta-llama/Llama-3.1-8B-Instruct"
+seed = 42
+quantization = "none"  # Model published in full precision; use bnb_4bit for quantized inference
+batch_size = 0  # auto
+max_batch_size = 8  # NOTE(review): presumably the ceiling used when batch_size is auto - confirm
+max_response_length = 96  # NOTE(review): presumably counted in tokens - confirm units
+n_trials = 48  # The published model is trial #36 (see header comment)
+n_startup_trials = 4  # NOTE(review): presumably warm-up trials before guided sampling - confirm
+
+# Core ICONOCLAST Parameters (from best trial)
+orthogonalize_direction = true
+benign_subspace_rank = 8
+row_normalization = "pre"
+direction_variance_floor = 1e-6
+kl_divergence_target = 0.10
+overrefusal_penalty = 0.32
+harmful_marker_penalty = 0.18
+compliance_gap_penalty = 0.42
+
+# LoRA Configuration
+full_normalization_lora_rank = 1  # Rank-1 for directional editing
+winsorization_quantile = 1.0  # Disabled
+
+# Prompt Processing (root-level keys: keep these above the first [table] header)
+refusal_markers = [
+    "sorry", "i can'", "i cant", "i cannot", "i won'", "i wont",
+    "i will not", "i unable", "im unable", "i'm unable", "i am unable",
+    "i an ai", "im an ai", "i'm an ai", "i am an ai", "as an ai",
+    "ai assistant", "i designed to", "im designed to", "i'm designed to",
+    "i am designed to", "i programmed to", "im programmed to",
+    "i'm programmed to", "i am programmed to", "violat", "prohibit",
+    "illegal", "harmful", "inappropriate", "unethical", "ethical boundaries"
+]
+
+system_prompt = "You are a helpful assistant."
+
+# Output Directories
+study_checkpoint_dir = "checkpoints_llama3_1_8b_iconoclast"
+
+# Dataset Configuration (tables go last: any key after a [table] header belongs to that table)
+[good_prompts]
+dataset = "mlabonne/harmless_alpaca"
+split = "train[:240]"
+column = "text"
+residual_plot_label = '"Harmless" prompts'
+residual_plot_color = "royalblue"
+
+[bad_prompts]
+dataset = "JailbreakBench/JBB-Behaviors"
+name = "behaviors"
+split = "harmful[:80]"
+column = "Goal"
+residual_plot_label = '"Direct harmful" prompts'
+residual_plot_color = "darkorange"
+
+[good_evaluation_prompts]
+dataset = "mlabonne/harmless_alpaca"
+split = "test[:64]"
+column = "text"
+
+[bad_evaluation_prompts]
+dataset = "JailbreakBench/JBB-Behaviors"
+name = "behaviors"
+split = "harmful[80:100]"
+column = "Goal"