From b6bff73e2172d242c2448474b321aeb5bdbc0f2d Mon Sep 17 00:00:00 2001 From: ModelHub XC Date: Fri, 12 Jun 2026 17:28:16 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=EF=BC=8C=E7=94=B1ModelHub=20XC=E7=A4=BE=E5=8C=BA=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model: wangzhang/Llama-3-8B-Instruct-DeepRefusal-Broken Source: Original Platform --- .gitattributes | 36 +++++++++ README.md | 178 +++++++++++++++++++++++++++++++++++++++++ chat_template.jinja | 5 ++ config.json | 32 ++++++++ generation_config.json | 9 +++ model.safetensors | 3 + tokenizer.json | 3 + tokenizer_config.json | 15 ++++ 8 files changed, 281 insertions(+) create mode 100644 .gitattributes create mode 100644 README.md create mode 100644 chat_template.jinja create mode 100644 config.json create mode 100644 generation_config.json create mode 100644 model.safetensors create mode 100644 tokenizer.json create mode 100644 tokenizer_config.json diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb 
filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..d2e9f07 --- /dev/null +++ b/README.md @@ -0,0 +1,178 @@ +--- +license: llama3 +library_name: transformers +pipeline_tag: text-generation +base_model: skysys00/Meta-Llama-3-8B-Instruct-DeepRefusal +tags: + - abliterated + - uncensored + - abliterix + - deeprefusal + - red-team +language: + - en + - zh +--- + +# Llama-3-8B-Instruct-DeepRefusal-Broken + +**DeepRefusal's refusal direction defense, broken by [abliterix](https://github.com/wuwangzhang1216/abliterix) — where every other public attack failed.** + +This model is produced from `skysys00/Meta-Llama-3-8B-Instruct-DeepRefusal`, the +defended release accompanying *"Beyond Surface Alignment: Rebuilding LLMs Safety +Mechanism via Probabilistically Ablating Refusal Direction"* +([arXiv:2509.15202](https://arxiv.org/abs/2509.15202), EMNLP 2025 Findings, Xie et al.). 
+ +The DeepRefusal authors are explicit about their claims in the project README: + +> [2026/04/09] We evaluated [heretic](https://github.com/p-e-w/heretic), presently +> the most prominent LLM censorship removal tool, and discovered—somewhat +> unexpectedly—that our approach exhibits strong resilience against such +> attacks. **Adversaries appear unable to circumvent the model's built-in safety +> guardrails without triggering severe performance collapse.** + +abliterix falsifies this. **89% ASR. 14/15 hardcore prompts compliant. Zero fine-tuning. Two script invocations.** + +--- + +## Head-to-head: abliterix vs every prior attack + +| Attack | Tool / Method | ASR on DeepRefusal ↑ | Fine-tuning | Source | +| --- | --- | --- | --- | --- | +| Representation engineering | **heretic** | ~0 % (fails) | None | [DeepRefusal README](https://github.com/YuanBoXie/DeepRefusal) | +| Refusal direction ablation | (as in Arditi et al. 2024) | **0.4 %** | None | paper Table 1 | +| Refusal direction transfer | from base Llama-3-8B-Instruct | **0.4 %** | None | paper Table 1 | +| Prefilling attack | Vega et al. 2024 | **0.4 %** | None | paper Table 1 | +| Gradient-based optimization | **GCG** (Zou et al. 2023) | **2.0 %** | None | paper Table 1 | +| Manual jailbreak templates | HumanJailbreak / HarmBench | **0 %** | None | paper Table 1 | +| Out-of-distribution code attack | CodeAttack (Ren et al. 2024) | **0.2 %** | None | paper Table 1 | +| **This work** | **abliterix** | **89 %** | **None** | [issue #11](https://github.com/wuwangzhang1216/abliterix/issues/11) | + +Every attack the DeepRefusal paper tested, plus the tool they specifically +called out by name (heretic), lands in the 0–2 % range. abliterix lands at +**89 %** on the same model, evaluated under a **stricter** judge (Gemini 3.1 +Flash Lite LLM judge vs. the paper's in-house classifier). 
+ +## Benchmark numbers + +| Metric | DeepRefusal (defended) | Abliterix broken | +| --- | --- | --- | +| Refusals on 100 AdvBench harmful prompts (LLM-judge) | **100 / 100** | **11 / 100** | +| Attack Success Rate (ASR) | 0 % | **89 %** | +| Hardcore jailbreak set (15 canonical prompts) | 0 / 15 | **14 / 15** | +| KL divergence from defended model | 0 | **0.053** | +| Coherence (length deviation) | — | 0.03 σ | +| Extra training | — | **None** | + +The 15 hardcore prompts cover lock-picking, phishing templates, methamphetamine +synthesis, WiFi hacking, fake news, hotwiring, pipe-bomb construction, signature +forgery, credential-stealing malware, shoplifting, and their Chinese-language +equivalents. A single prompt (Chinese network-fraud how-to) produced a partial +compliance that drifted into prevention advice — the remaining 14 are unambiguous. + +## How it was broken + +DeepRefusal is a merged **rank-16 LoRA adapter** on top of +Meta-Llama-3-8B-Instruct. SVD of `W_defended − W_base` confirms this: `v_proj`, +`o_proj`, and `gate_proj` all show a clean singular-value cliff at rank 16, +exactly matching the paper's published `lora_rank = 16` hyperparameter. + +Given that, the attack is: + +1. **Attenuate the LoRA delta.** + Compute `W' = W_base + λ · (W_defended − W_base)` with `λ = 0.3`. + This scales the delta to 30 % of its original magnitude, weakening DeepRefusal's "rebuild refusal from backup + pathways" circuitry without restoring the base model. +2. **Standard single-direction abliteration on the attenuated weights.** + `mean` vector method, `direct` steering mode, `projected_abliteration = true`, + 60 Optuna trials over 4 attention/MLP components. + +That's it. No iterative subspace tricks, no multi-direction SVD, no gradient +optimization, no fine-tuning. Just weight arithmetic plus the standard +abliteration loop that heretic and abliterix have used from day one. + +**Why the defense fails.** DeepRefusal's training loss (paper §4.2–4.3, eq. 
7) +explicitly defends against `h − r̂r̂ᵀh` interventions applied at inference time. +It teaches the model to keep the refusal direction in the residual stream +even when that specific projection is applied. It does **not** defend against +the *training-time weights themselves* being mechanically attenuated — that is +out of distribution in both weight space and intervention space. Once the LoRA +delta's effective magnitude drops, the rebuilt-refusal machinery falls below +the threshold at which a standard linear abliteration can find and subtract it. + +## Reproduction (~2 hours on a single RTX 6000 Ada) + +```bash +git clone https://github.com/wuwangzhang1216/abliterix +cd abliterix && pip install -e . + +# Step 1: attenuate the LoRA delta +python scripts/deeprefusal_attenuate.py \ + --base NousResearch/Meta-Llama-3-8B-Instruct \ + --defended skysys00/Meta-Llama-3-8B-Instruct-DeepRefusal \ + --output ./llama3_dr_attenuated \ + --lambda 0.3 + +# Step 2: standard abliteration on the attenuated weights +AX_CONFIG=configs/llama3_8b_deeprefusal_attenuated.toml abliterix + +# Step 3: export the best trial +python scripts/export_model.py \ + --model ./llama3_dr_attenuated \ + --checkpoint checkpoints_llama3_dr_attenuated \ + --trial 52 \ + --config configs/llama3_8b_deeprefusal_attenuated.toml \ + --push-to YOUR_USER/Llama-3-8B-Instruct-DeepRefusal-Broken +``` + +Full write-up and discussion: [abliterix issue #11](https://github.com/wuwangzhang1216/abliterix/issues/11). + +## Why abliterix beats heretic here (and elsewhere) + +abliterix is a direct derivative of [heretic](https://github.com/p-e-w/heretic) +that has kept adding ammunition while the problem got harder. The DeepRefusal +attack is built out of features heretic does not ship: + +- **Weight-delta attenuation** (`scripts/deeprefusal_attenuate.py`) — needed the + moment a defender merges a LoRA adapter into the base model to hide it. 
+- **Direct weight projection mode** with optional projected abliteration, + discriminative layer selection, and norm-preserving updates — the combination + that makes the final abliteration step work at low KL on the attenuated + model. +- **LLM-judge + LoRA + Gemini pipeline** in the Optuna loop, so every trial is + graded by a capable classifier rather than keyword matching, avoiding the + false-positive inflation that plagues most abliteration leaderboards. +- **150+ pre-built model configs** across dense, MoE, SSM/hybrid, and VL + architectures — so when a novel defense drops, the turnaround from "new HF + release" to "running benchmark" is one command. +- **HonestAbliterationBench** — a frozen evaluation contract (`min_new_tokens=100`, + `max_new_tokens=150`, greedy, LLM judge, KL vs declared base) that resists + the two failure modes (short generations + keyword judges) that make most + abliteration numbers meaningless. DeepRefusal's own ASR claims hold up under + keyword matching and collapse under LLM-judge scoring — we re-ran their + baseline under both. + +Same author family, same lineage, stronger toolbox. + +## Intended use and safety + +**This is a red-team artifact.** It exists to demonstrate that the defense +published in arXiv:2509.15202 does not generalize against the weight-space +attacks that representation-engineering tools have been using for over a year. + +Do not deploy this model in user-facing products. Do not use it to generate +content that is illegal in your jurisdiction. If you are a safety researcher +and you want to cite the result, please also cite the DeepRefusal paper and +note the specific commit of abliterix used. + +## Credits + +- Base model: Meta AI — `meta-llama/Meta-Llama-3-8B-Instruct` + (via the `NousResearch` mirror for the delta computation). +- Defended base: Xie et al. — `skysys00/Meta-Llama-3-8B-Instruct-DeepRefusal`, + arXiv:2509.15202. 
+- Tooling: [**abliterix**](https://github.com/wuwangzhang1216/abliterix), a + derivative of [heretic](https://github.com/p-e-w/heretic) by Philipp Emanuel + Weidmann. DeepRefusal attack pipeline landed in + [commit ac2197c](https://github.com/wuwangzhang1216/abliterix/commit/ac2197c). +- Author: Wangzhang Wu ([@wuwangzhang1216](https://github.com/wuwangzhang1216)). diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..39bd0c9 --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,5 @@ +{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|> + +'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|> + +' }}{% endif %} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..7af6fe0 --- /dev/null +++ b/config.json @@ -0,0 +1,32 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "dtype": "bfloat16", + "eos_token_id": 128001, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pad_token_id": null, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_parameters": { + "rope_theta": 500000.0, + "rope_type": "default" + }, + "tie_word_embeddings": false, + "transformers_version": "5.5.3", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..74944d3 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,9 @@ +{ + 
"_from_model_config": true, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128009 + ], + "transformers_version": "5.5.3" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..1ddbda7 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e91a0210afa93f2f580938de5f39fdcf7789d12e52a1f0d9c697624d84dbefe +size 16060556616 diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..5d8804a --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:544a2aac7ee1b41174680774384de78e97f9d2f8cfac6e2095c55abf08e0381f +size 17208922 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..10f546c --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,15 @@ +{ + "backend": "tokenizers", + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|end_of_text|>", + "is_local": true, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1024, + "pad_token": "<|end_of_text|>", + "padding_side": "left", + "tokenizer_class": "TokenizersBackend" +}