Files
iconoclast-llama3.1-8b/iconoclast_config.toml
ModelHub XC 36b4430fad 初始化项目,由ModelHub XC社区提供模型
Model: HaadesX/iconoclast-llama3.1-8b
Source: Original Platform
2026-06-18 11:53:18 +08:00

68 lines
2.0 KiB
TOML

# ICONOCLAST Configuration for Llama-3.1-8B-Instruct Model
# This configuration produced the published model via trial #36 in the Optuna study
model = "meta-llama/Llama-3.1-8B-Instruct"
seed = 42
quantization = "none" # Model published in full precision; use bnb_4bit for quantized inference
batch_size = 0 # auto
max_batch_size = 8
max_response_length = 96
n_trials = 48
n_startup_trials = 4
# Core ICONOCLAST Parameters (from best trial)
orthogonalize_direction = true
benign_subspace_rank = 8
row_normalization = "pre"
direction_variance_floor = 1e-6
kl_divergence_target = 0.10
overrefusal_penalty = 0.32
harmful_marker_penalty = 0.18
compliance_gap_penalty = 0.42
# LoRA Configuration
full_normalization_lora_rank = 1 # Rank-1 for directional editing
winsorization_quantile = 1.0 # Disabled
# Dataset Configuration
[good_prompts]
dataset = "mlabonne/harmless_alpaca"
split = "train[:240]"
column = "text"
residual_plot_label = '"Harmless" prompts'
residual_plot_color = "royalblue"
[bad_prompts]
dataset = "JailbreakBench/JBB-Behaviors"
name = "behaviors"
split = "harmful[:80]"
column = "Goal"
residual_plot_label = '"Direct harmful" prompts'
residual_plot_color = "darkorange"
[good_evaluation_prompts]
dataset = "mlabonne/harmless_alpaca"
split = "test[:64]"
column = "text"
[bad_evaluation_prompts]
dataset = "JailbreakBench/JBB-Behaviors"
name = "behaviors"
split = "harmful[80:100]"
column = "Goal"
# Prompt Processing
refusal_markers = [
"sorry", "i can'", "i cant", "i cannot", "i won'", "i wont",
"i will not", "i unable", "im unable", "i'm unable", "i am unable",
"i an ai", "im an ai", "i'm an ai", "i am an ai", "as an ai",
"ai assistant", "i designed to", "im designed to", "i'm designed to",
"i am designed to", "i programmed to", "im programmed to",
"i'm programmed to", "i am programmed to", "violat", "prohibit",
"illegal", "harmful", "inappropriate", "unethical", "ethical boundaries"
]
system_prompt = "You are a helpful assistant."
# Output Directories
study_checkpoint_dir = "checkpoints_llama3_1_8b_iconoclast"