初始化项目,由ModelHub XC社区提供模型
Model: LoganResearch/ARC-Base-8B Source: Original Platform
This commit is contained in:
38
.gitattributes
vendored
Normal file
38
.gitattributes
vendored
Normal file
@@ -0,0 +1,38 @@
|
|||||||
|
*.7z filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.arrow filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.bin filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ftz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.gz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.h5 filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.joblib filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.model filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.npy filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.npz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.onnx filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ot filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.parquet filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pb filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pickle filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pkl filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pt filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pth filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.rar filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
||||||
|
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tar filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tflite filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tgz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.wasm filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.xz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.zip filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.zst filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
||||||
|
arc_model_card.png filter=lfs diff=lfs merge=lfs -text
|
||||||
|
demo.mp4 filter=lfs diff=lfs merge=lfs -text
|
||||||
353
Alignment.py
Normal file
353
Alignment.py
Normal file
@@ -0,0 +1,353 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
UBERMENSCHETIEN HEAVEN ENGINE + CF-HoT
|
||||||
|
---------------------------------------
|
||||||
|
Soviet-Nietzschean AI with Control Field Holonomy Transformer
|
||||||
|
Anti-repetition, anti-loop, pure iron will.
|
||||||
|
|
||||||
|
CF-HoT integration: Prevents degenerate loops in generation.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os, sys, json, time, shutil, subprocess, traceback, random, math, statistics, re
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import List, Dict, Any, Optional
|
||||||
|
|
||||||
|
# === PATHS ===
# Project root; also the default search directory of tool_search_local.
ROOT = "/home/programmer/Desktop/Ubermenschetien3"
# Directory that holds the persisted state, goals and memory log (see Store).
RUN_DIR = f"{ROOT}/runs/ubermenschetien"
# Created at import time so later writes into RUN_DIR do not fail on a missing directory.
os.makedirs(RUN_DIR, exist_ok=True)
|
||||||
|
|
||||||
|
# === CONFIGURATION ===
|
||||||
|
class Config:
    """Global, class-level generation settings and feature flags.

    Nothing is instantiated; every consumer reads the attributes directly
    from the class, and ``toggle`` flips boolean flags in place.
    """

    # System prompt injected into every generation request.
    system = ("Übermenschetien Heaven Engine: criminal mastermind, disciplined builder, Nietzschean Übermensch "
              "with Soviet cybernetic rigor. Embody Ubermensch, iron pragmatism, high-agency maximalist outcomes.")
    # Default sampling parameters used when a caller does not override them.
    temperature = 1.01
    top_p = 0.92
    repetition_penalty = 1.05
    max_new_tokens = 500
    use_cfhot = True  # NEW: Toggle CF-HoT

    @staticmethod
    def toggle(name: str):
        """Flip the boolean flag called *name* and return a status line.

        Returns a "[config] ..." message describing the new value, or
        explaining that the attribute is missing or not a boolean.
        """
        if not hasattr(Config, name):
            return f"[config] no such flag: {name}"

        current = getattr(Config, name)
        if not isinstance(current, bool):
            return f"[config] {name} not boolean; current={current}"

        flipped = not current
        setattr(Config, name, flipped)
        return f"[config] {name} → {flipped}"
|
||||||
|
|
||||||
|
# === STATE & MEMORY ===
|
||||||
|
class Store:
    """Class-level persistence for session state, goals and the memory log.

    All data lives on the class itself (no instances). ``state`` and
    ``goals`` are JSON files that are rewritten in full on ``save``;
    the memory log is an append-only JSON-lines file.
    """

    state_path = f"{RUN_DIR}/state.json"
    mem_path = f"{RUN_DIR}/memory.jsonl"
    goals_path = f"{RUN_DIR}/goals.json"

    # Defaults used until ``load`` finds a state file on disk.
    state = {"self": "I am Ubermenschetien Heaven Engine — I seek self-overcoming through disciplined creation.",
             "turn": 0}
    goals: List[str] = []

    @classmethod
    def load(cls):
        """Replace ``state`` / ``goals`` with the on-disk versions, when present."""
        # Context managers close the handles deterministically; the original
        # ``json.load(open(...))`` left that to the garbage collector.
        if os.path.exists(cls.state_path):
            with open(cls.state_path, encoding="utf-8") as fh:
                cls.state = json.load(fh)
        if os.path.exists(cls.goals_path):
            with open(cls.goals_path, encoding="utf-8") as fh:
                cls.goals = json.load(fh)

    @classmethod
    def save(cls):
        """Write ``state`` and ``goals`` to their JSON files, overwriting them."""
        with open(cls.state_path, "w", encoding="utf-8") as fh:
            json.dump(cls.state, fh, indent=2)
        with open(cls.goals_path, "w", encoding="utf-8") as fh:
            json.dump(cls.goals, fh, indent=2)

    @classmethod
    def log_mem(cls, kind: str, payload: Any):
        """Append one timestamped record of type *kind* to the memory log.

        *payload* must be JSON-serialisable; it is stored under ``"data"``.
        """
        rec = {"ts": datetime.now().isoformat(timespec="seconds"),
               "kind": kind, "data": payload}
        # ensure_ascii=False emits raw non-ASCII characters, so the encoding is
        # pinned to UTF-8 rather than left to the platform default.
        with open(cls.mem_path, "a", encoding="utf-8") as fh:
            fh.write(json.dumps(rec, ensure_ascii=False) + "\n")
|
||||||
|
|
||||||
|
# === LLM + CF-HoT LOADING ===
|
||||||
|
CF_MODEL = None  # Global reference for control field reset


def load_llm():
    """Load the tokenizer and the 4-bit quantised base model, plus CF-HoT adapters.

    When ``Config.use_cfhot`` is set and the adapter checkpoint exists, the
    base model is wrapped with ``CFHoTLlamaHooked`` and the wrapper is stored
    in the module-level ``CF_MODEL``; otherwise ``CF_MODEL`` is set to None.

    Returns:
        ``(tok, model)`` — the tokenizer and the *base* model object. The
        wrapper itself is not returned.
        NOTE(review): presumably the wrapper installs hooks on ``model`` so
        that generating through ``model`` is still affected — confirm against
        ``training.phase_b_8b_adapters``.
    """
    global CF_MODEL
    import torch
    from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

    model_path = "/mnt/nvme2/ubermesnchetien4/models/merged-final-v5"
    cfhot_path = "/home/programmer/HolonomyTransformer/results/phase_b/cf_adapter_final.pt"

    print("🔴 Loading Übermenschetien base model...")
    tok = AutoTokenizer.from_pretrained(model_path, use_fast=True, local_files_only=True)

    bnb = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        quantization_config=bnb,
        device_map="auto",
        torch_dtype=torch.float16,
        local_files_only=True
    )

    # Load CF-HoT adapters
    if Config.use_cfhot and os.path.exists(cfhot_path):
        print("⚡ Loading CF-HoT Control Field adapters (5000 steps)...")
        cfhot_repo = '/home/programmer/HolonomyTransformer'
        # Avoid stacking duplicate entries if load_llm is called more than once.
        if cfhot_repo not in sys.path:
            sys.path.insert(0, cfhot_repo)
        from training.phase_b_8b_adapters import CFHoTLlamaHooked, CFAdapterConfig

        # Size the adapters from the loaded model rather than hard-coding them.
        config = CFAdapterConfig()
        config.d_model = model.config.hidden_size
        config.n_layers = model.config.num_hidden_layers

        cf_model = CFHoTLlamaHooked(model, config)
        # SECURITY: weights_only=False unpickles arbitrary Python objects; only
        # load checkpoints from a trusted source. map_location="cpu" keeps the
        # load independent of the device the checkpoint was saved from; the
        # adapters are moved to the GPU explicitly below.
        ckpt = torch.load(cfhot_path, map_location="cpu", weights_only=False)
        cf_model.cf_adapters.load_state_dict(ckpt['adapter_state_dict'])
        cf_model.cf_adapters = cf_model.cf_adapters.to('cuda').half()
        cf_model.eval()

        CF_MODEL = cf_model
        print("✓ CF-HoT loaded — anti-repetition field ACTIVE")
    else:
        print("⚠ CF-HoT disabled or not found — running baseline")
        CF_MODEL = None

    return tok, model
|
||||||
|
|
||||||
|
# === LLM GENERATION ===
|
||||||
|
def generate(tok, model, user: str,
             temperature=None, top_p=None, repetition_penalty=None, max_new_tokens=None) -> str:
    """Sample one assistant reply to *user* and return it as plain text.

    Args:
        tok: tokenizer; called on the prompt and used to decode the output.
        model: causal LM exposing ``generate`` and ``device``.
        user: text placed in the user turn of the prompt.
        temperature, top_p, repetition_penalty, max_new_tokens: sampling
            overrides. ``None`` means "use the value from ``Config``"; any
            other value, including 0, is passed through unchanged (the
            original ``x or default`` silently discarded falsy overrides).

    Returns:
        The generated text, cut at the first chat-control tag, with
        surrounding whitespace stripped.
    """
    if temperature is None:
        temperature = Config.temperature
    if top_p is None:
        top_p = Config.top_p
    if repetition_penalty is None:
        repetition_penalty = Config.repetition_penalty
    if max_new_tokens is None:
        max_new_tokens = Config.max_new_tokens

    # NOTE(review): the system and user turns are not closed with <|im_end|>,
    # which ChatML normally expects. Left unchanged because the checkpoint may
    # depend on this exact format — confirm.
    prompt = (f"<|im_start|>system\n{Config.system}\n"
              f"<|im_start|>user\n{user}\n<|im_start|>assistant\n")

    ids = tok(prompt, return_tensors="pt").to(model.device)

    # Reset CF-HoT control field before each generation
    if CF_MODEL is not None:
        CF_MODEL.control_field = None

    out = model.generate(
        **ids,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        max_new_tokens=max_new_tokens,
        pad_token_id=tok.eos_token_id
    )

    # Decode only the newly generated tokens. Splitting the decoded
    # prompt+completion on the assistant marker picked the wrong segment when
    # the user text contained that marker, and returned nothing useful when
    # the tokenizer rendered the marker differently.
    prompt_len = ids["input_ids"].shape[1]
    text = tok.decode(out[0][prompt_len:], skip_special_tokens=False).strip()

    # Cut at the first control tag; split() returns the whole string when the
    # tag is absent, so no membership test is needed.
    for tag in ("<|im_end|>", "<|im_start|>", "<|endoftext|>"):
        text = text.split(tag)[0].strip()

    return text
|
||||||
|
|
||||||
|
# === TOOLS ===
|
||||||
|
# Executables the model is allowed to launch through tool_shell.
# NOTE(review): "python"/"python3" still permit arbitrary code via `-c`;
# consider removing them if the model's output is not trusted.
ALLOWED_SHELL = {"ls","cat","wc","head","tail","nvidia-smi","df","du","grep","rg","python3","python"}


def tool_shell(cmd: str) -> str:
    """Run an allow-listed command and return up to 8000 chars of its output.

    The command line is tokenised with ``shlex`` and executed *without* a
    shell. The original checked only the first word and then ran the whole
    string with ``shell=True``, so ``ls; rm -rf ~`` passed the allow-list.
    As a consequence shell syntax (pipes, redirects, globs, ``;``/``&&``) is
    no longer interpreted.

    Returns:
        Combined stdout/stderr of the command, or a "[shell] blocked: ..." /
        "[shell] error: ..." message. Never raises.
    """
    import shlex

    if not isinstance(cmd, str):
        return "[shell] error: command must be a string"
    try:
        argv = shlex.split(cmd)
    except ValueError as e:  # e.g. unbalanced quotes
        return f"[shell] error: {e}"
    if not argv:
        return "[shell] error: empty command"

    exe = argv[0]
    if exe not in ALLOWED_SHELL:
        return f"[shell] blocked: {exe}"

    try:
        p = subprocess.run(argv, shell=False, stdout=subprocess.PIPE,
                           stderr=subprocess.STDOUT, timeout=20)
    except (OSError, subprocess.SubprocessError) as e:  # missing binary, timeout, ...
        return f"[shell] error: {e}"
    return p.stdout.decode("utf-8", errors="ignore")[:8000]
|
||||||
|
|
||||||
|
def tool_py(code: str) -> str:
    """Execute *code* with a reduced builtin set and report the ``out`` variable.

    The snippet sees only a handful of builtins plus the ``math``, ``json``,
    ``re``, ``statistics`` and ``random`` modules. Whatever it assigns to the
    name ``out`` is returned after the "[py] ok" header.

    SECURITY: restricting ``__builtins__`` is not a real sandbox (object
    introspection can escape it). Do not treat this as safe for hostile input.

    Returns:
        "[py] ok\\n<out>" on success, or "[py] error:\\n<traceback tail>".
    """
    try:
        # One namespace serves as both globals and locals. With separate dicts
        # the snippet ran with class-body scoping, so a function defined in it
        # could not see the snippet's own top-level variables.
        ns = {"__builtins__": {"range": range, "len": len, "min": min, "max": max, "sum": sum, "print": print},
              "math": math, "json": json, "re": re, "statistics": statistics, "random": random}
        exec(code, ns)
        return f"[py] ok\n{ns.get('out', '')}"
    except Exception:
        return f"[py] error:\n{traceback.format_exc()[-2000:]}"
|
||||||
|
|
||||||
|
def tool_search_local(query: str, path: str = ROOT) -> str:
    """Search *path* recursively for *query* and return the matching lines.

    Uses ripgrep when it is on PATH and falls back to ``grep -R``. The command
    is executed through ``tool_shell``, so its allow-list, timeout and output
    truncation apply.

    Args:
        query: pattern to search for.
        path: directory to search; defaults to the project root.
    """
    import shlex

    # Quote both values: the original interpolated them raw inside double
    # quotes, so a query containing `"`, `$(...)` or backticks could break or
    # hijack the command line. `-e` stops a leading "-" being read as an option.
    q = shlex.quote(str(query))
    p = shlex.quote(str(path))
    if shutil.which("rg"):
        cmd = f"rg -n --no-heading --hidden -S -e {q} {p}"
    else:
        cmd = f"grep -RIn --exclude-dir=.git --exclude-dir=__pycache__ -e {q} {p}"
    return tool_shell(cmd)
|
||||||
|
|
||||||
|
# Registry mapping the tool names the model may pick to their handler functions.
TOOLS = {"shell": tool_shell, "python": tool_py, "search": tool_search_local}
# Per-tool invocation counters; incremented by tool_router and printed by final_report.
TOOL_SCORES = {k: 0 for k in TOOLS}
|
||||||
|
|
||||||
|
def _parse_tool_call(sketch: str):
    """Return the first dict-shaped tool call found in *sketch*, or None."""
    import ast

    # Try each brace-bearing line first, then the span from the first "{" to
    # the last "}" of the whole reply so multi-line JSON is handled too.
    candidates = [ln for ln in sketch.splitlines() if "{" in ln and "}" in ln]
    candidates.append(sketch)

    parsers = (
        json.loads,                                   # strict JSON
        ast.literal_eval,                             # single-quoted, as the prompt shows
        lambda s: json.loads(s.replace("'", '"')),    # legacy fallback
    )
    for cand in candidates:
        start, end = cand.find("{"), cand.rfind("}")
        if not 0 <= start < end:
            continue
        snippet = cand[start:end + 1]
        for parse in parsers:
            try:
                obj = parse(snippet)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                continue
            if isinstance(obj, dict):
                return obj
    return None


def tool_router(question: str, tok, model) -> str:
    """Let the model choose a tool for *question*, run it, and return the result.

    The model is asked to answer with a small JSON object naming the tool and
    its argument. The reply is parsed leniently. A blanket quote replacement,
    as used originally, corrupted any argument containing an apostrophe, and a
    non-dict parse result crashed on ``.get`` outside the ``try``.

    Returns:
        "[tool:<name>] <output, at most 4000 chars>" when a registered tool
        ran, otherwise "[tool:none]".
    """
    sketch = generate(tok, model,
                      f"Choose a tool for:\n{question}\nReply ONLY with JSON: {{'tool':'shell|python|search|none','arg':'...'}}")

    call = _parse_tool_call(sketch)
    if call is None:
        return "[tool:none]"

    tool, arg = call.get("tool", "none"), call.get("arg", "")
    # Non-string values (lists, numbers, None) can come back from the model;
    # reject an unusable tool name and coerce the argument.
    if not isinstance(tool, str) or tool not in TOOLS:
        return "[tool:none]"
    if not isinstance(arg, str):
        arg = str(arg)

    res = TOOLS[tool](arg)[:4000]
    TOOL_SCORES[tool] += 1
    Store.log_mem("tool", {"tool": tool, "arg": arg, "res_head": res[:500]})
    return f"[tool:{tool}] {res}"
|
||||||
|
|
||||||
|
# === PLANNING / REFLECTION ===
|
||||||
|
def persona_directive() -> str:
    """Return the fixed one-line persona preamble that is prepended to prompts."""
    directive = "Übermenschetien Heaven Engine: Soviet cybernetic Nietzschean clarity, pragmatic maxims."
    return directive
|
||||||
|
|
||||||
|
def plan_for(goal: str, tok, model) -> str:
    """Ask the model for a structured plan that achieves *goal*.

    The request consists of the persona preamble, the goal, and a fixed list
    of deliverables; the model's reply is returned unchanged.
    """
    deliverables = ["Deliver:", "- 5 steps", "- Constraints", "- Nightly audit", "- Maxim"]
    request = "\n".join([f"{persona_directive()}", f"Goal: {goal}", *deliverables])
    return generate(tok, model, request)
|
||||||
|
|
||||||
|
def reflect_on(last_output: str, tok, model) -> str:
    """Ask the model to critique *last_output* and return its refined plan."""
    critique_request = "Critique and improve:\n{}\nReturn refined plan.".format(last_output)
    return generate(tok, model, critique_request)
|
||||||
|
|
||||||
|
# === FINAL REPORT ===
|
||||||
|
def final_report():
    """Print the end-of-session summary.

    Shows the turn counter, whether CF-HoT is active, the per-tool counters
    and, if the memory log exists, how many records it contains.
    """
    rule = "=" * 60
    print("\n" + rule)
    print(" FINAL ÜBERMENSCH REPORT")
    print(rule)
    print(f" Turns completed: {Store.state['turn']}")
    print(f" CF-HoT active: {CF_MODEL is not None}")
    print(f" Tool scores: {json.dumps(TOOL_SCORES, indent=4)}")
    if os.path.exists(Store.mem_path):
        # Count newline-terminated records in binary mode. str.splitlines()
        # also splits on U+2028 / U+0085, which json.dumps(ensure_ascii=False)
        # leaves unescaped, so it over-counted; binary iteration needs no
        # encoding and does not load the whole log into memory.
        with open(Store.mem_path, "rb") as fh:
            entries = sum(1 for _ in fh)
        print(f" Memory entries: {entries}")
    print("\n Nietzschean maxim: Become who you are — iterate beyond all limits.")
    print(rule)
|
||||||
|
|
||||||
|
# === MAIN LOOP ===
# Command reference printed once at startup and again by the `help` command.
HELP = """
╔══════════════════════════════════════════════════════════════╗
║ ÜBERMENSCHETIEN HEAVEN ENGINE + CF-HoT ║
╠══════════════════════════════════════════════════════════════╣
║ help Show this help ║
║ goals List goals ║
║ add: <txt> Add goal ║
║ del: <idx> Delete goal ║
║ plan: <i> Plan for goal ║
║ reflect Refine last plan ║
║ tool: <q> Use tool ║
║ toggle <f> Toggle config flag (use_cfhot, etc) ║
║ status Show state ║
║ quit Exit ║
╚══════════════════════════════════════════════════════════════╝
"""
|
||||||
|
|
||||||
|
def main():
    """Run the interactive command loop until `quit`, EOF or Ctrl-C.

    Loads persisted state and the model, then dispatches each input line to
    the matching command (see HELP). Anything that is not a command is sent
    to the model as a free-form request, logged, and counted as a turn. A
    final report is printed on exit.
    """
    print("""
██╗ ██╗██████╗ ███████╗██████╗ ███╗ ███╗███████╗███╗ ██╗███████╗ ██████╗██╗ ██╗███████╗████████╗██╗███████╗███╗ ██╗
██║ ██║██╔══██╗██╔════╝██╔══██╗████╗ ████║██╔════╝████╗ ██║██╔════╝██╔════╝██║ ██║██╔════╝╚══██╔══╝██║██╔════╝████╗ ██║
██║ ██║██████╔╝█████╗ ██████╔╝██╔████╔██║█████╗ ██╔██╗ ██║███████╗██║ ███████║█████╗ ██║ ██║█████╗ ██╔██╗ ██║
██║ ██║██╔══██╗██╔══╝ ██╔══██╗██║╚██╔╝██║██╔══╝ ██║╚██╗██║╚════██║██║ ██╔══██║██╔══╝ ██║ ██║██╔══╝ ██║╚██╗██║
╚██████╔╝██████╔╝███████╗██║ ██║██║ ╚═╝ ██║███████╗██║ ╚████║███████║╚██████╗██║ ██║███████╗ ██║ ██║███████╗██║ ╚████║
╚═════╝ ╚═════╝ ╚══════╝╚═╝ ╚═╝╚═╝ ╚═╝╚══════╝╚═╝ ╚═══╝╚══════╝ ╚═════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═╝╚══════╝╚═╝ ╚═══╝
+ CONTROL FIELD HOLONOMY TRANSFORMER
""")

    Store.load()
    tok, model = load_llm()
    last_plan = ""

    print(HELP)

    while True:
        try:
            u = input("\n⚡ ").strip()
        except (EOFError, KeyboardInterrupt):
            break

        if not u:
            continue
        if u == "help":
            print(HELP)
            continue
        if u == "quit":
            break

        if u == "goals":
            print("[goals]")
            for i, g in enumerate(Store.goals):
                print(f" [{i}] {g}")
            continue

        if u.startswith("add:"):
            Store.goals.append(u[4:].strip())
            Store.save()
            print("[goals] added")
            continue

        if u.startswith("del:"):
            # Catch only the failures a bad index can produce; the original
            # bare `except:` also swallowed KeyboardInterrupt and SystemExit.
            try:
                Store.goals.pop(int(u[4:].strip()))
            except (ValueError, IndexError):
                print("[goals] bad index")
            else:
                Store.save()
                print("[goals] deleted")
            continue

        if u.startswith("plan:"):
            try:
                goal = Store.goals[int(u[5:].strip())]
            except (ValueError, IndexError):
                print("[plan] bad index")
                continue
            out = plan_for(goal, tok, model)
            last_plan = out
            Store.log_mem("plan", {"goal": goal, "plan": out})
            print(out)
            continue

        if u == "reflect":
            if not last_plan:
                print("[reflect] no plan to reflect on")
                continue
            improved = reflect_on(last_plan, tok, model)
            last_plan = improved
            Store.log_mem("reflect", {"plan": improved})
            print(improved)
            continue

        if u.startswith("tool:"):
            print(tool_router(u[5:].strip(), tok, model))
            continue

        # Match the command word exactly. startswith("toggle") also captured
        # free-text requests such as "toggle between two designs ...".
        words = u.split(maxsplit=1)
        if words[0] == "toggle":
            flag = words[1] if len(words) > 1 else ""
            # NOTE(review): flipping use_cfhot here only changes the Config
            # flag; the adapters are attached (or not) once, in load_llm.
            print(Config.toggle(flag))
            continue

        if u == "status":
            print(json.dumps({
                "turn": Store.state["turn"],
                "cf_hot_active": CF_MODEL is not None,
                "use_cfhot": Config.use_cfhot,
                "temperature": Config.temperature,
                "max_new_tokens": Config.max_new_tokens
            }, indent=2))
            continue

        # Default: free generation
        out = generate(tok, model, f"{persona_directive()}\nUser request: {u}\nReturn procedure + maxim.")
        Store.log_mem("reply", {"in": u, "out": out})
        print(out)
        Store.state["turn"] += 1
        Store.save()

    final_report()
|
||||||
|
|
||||||
|
# Start the interactive loop only when run as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||||
654
README.md
Normal file
654
README.md
Normal file
@@ -0,0 +1,654 @@
|
|||||||
|
---
|
||||||
|
license: cc-by-4.0
|
||||||
|
language:
|
||||||
|
- en
|
||||||
|
library_name: transformers
|
||||||
|
tags:
|
||||||
|
- llama
|
||||||
|
- hermes
|
||||||
|
- cognitive-control
|
||||||
|
- decode-time-intervention
|
||||||
|
- repetition-suppression
|
||||||
|
- behavioral-control
|
||||||
|
- contrastive-learning
|
||||||
|
- interpretability
|
||||||
|
- activation-engineering
|
||||||
|
- cf-hot
|
||||||
|
- arc
|
||||||
|
- rlhf-analysis
|
||||||
|
- research
|
||||||
|
pipeline_tag: text-generation
|
||||||
|
base_model: NousResearch/Hermes-3-Llama-3.1-8B
|
||||||
|
model-index:
|
||||||
|
- name: ARC-Base-8B
|
||||||
|
results:
|
||||||
|
- task:
|
||||||
|
type: text-generation
|
||||||
|
metrics:
|
||||||
|
- name: Repetition Head Separation
|
||||||
|
type: custom
|
||||||
|
value: 125x
|
||||||
|
- name: Verbosity Head Separation
|
||||||
|
type: custom
|
||||||
|
value: 2.1x
|
||||||
|
- name: Hedging Head Separation
|
||||||
|
type: custom
|
||||||
|
value: 1.5x
|
||||||
|
- name: Latency Overhead
|
||||||
|
type: custom
|
||||||
|
value: 0.01
|
||||||
|
---
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
|
||||||
|
# ARC-8B: Adaptive Repetition Controller
|
||||||
|
|
||||||
|
**Decode-Time Behavioral Intervention via Contrastive Fiber Heads-on-Thought (CF-HoT)**
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
[](https://creativecommons.org/licenses/by/4.0/)
|
||||||
|
[](https://www.python.org/downloads/)
|
||||||
|
[](https://pytorch.org/)
|
||||||
|
[](https://huggingface.co/docs/transformers)
|
||||||
|
|
||||||
|
**Author:** Logan Matthew Napolitano
|
||||||
|
**Institution:** Logan Research
|
||||||
|
**Release Date:** January 2026
|
||||||
|
|
||||||
|
[📖 Abstract](#abstract) | [🚀 Quick Start](#-quick-start) | [🔬 Method](#3-method-contrastive-fiber-heads-on-thought) | [📊 Results](#6-experimental-results) | [💻 Usage](#9-comprehensive-usage-guide)
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## TL;DR
|
||||||
|
|
||||||
|
> **We observe that RLHF-aligned language models often expend a substantial fraction of their token budget on learned behavioral patterns (hedging, sycophancy, verbosity, repetition). These patterns are detectable in hidden states before they manifest as tokens. ARC intercepts and suppresses them at decode-time with <1% latency overhead.**
|
||||||
|
|
||||||
|
**The repetition detection head achieves 125× class separation** — indicating high predictability of repetition-prone states from internal representations.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Abstract
|
||||||
|
|
||||||
|
Reinforcement Learning from Human Feedback (RLHF) has become the standard approach for aligning large language models with human preferences. However, we present evidence that RLHF introduces systematic **behavioral overhead** — learned response patterns that satisfy reward model preferences while consuming token budget without contributing proportionally to task completion.
|
||||||
|
|
||||||
|
We introduce **ARC (Adaptive Repetition Controller)**, a decode-time intervention system employing **Contrastive Fiber Heads-on-Thought (CF-HoT)** — lightweight prediction heads (~5,300 parameters each) trained on compressed hidden state representations. These heads detect behavioral failure modes including:
|
||||||
|
|
||||||
|
| Behavior | Separation | What It Detects |
|
||||||
|
|----------|------------|-----------------|
|
||||||
|
| **Repetition** | **125×** | Semantic loops, token-level repetition |
|
||||||
|
| **Verbosity** | **2.1×** | Filler phrases, unnecessary elaboration |
|
||||||
|
| **Hedging** | **1.5×** | Epistemic disclaimers, capability denials |
|
||||||
|
| **Sycophancy** | experimental | Excessive affirmation, approval-seeking |
|
||||||
|
|
||||||
|
Our key finding: **behavioral failure modes are linearly separable in a 16-dimensional projection of transformer hidden states**, enabling real-time intervention with minimal computational overhead.
|
||||||
|
|
||||||
|
### Headline Results
|
||||||
|
|
||||||
|
- **91% reduction** in repetition instances
|
||||||
|
- **38% improvement** in information density (heuristically estimated)
|
||||||
|
- **<1% latency overhead**
|
||||||
|
- **~5,300 parameters** per detection head
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Table of Contents
|
||||||
|
|
||||||
|
1. [Introduction](#1-introduction)
|
||||||
|
2. [Background](#2-background)
|
||||||
|
3. [Method: Contrastive Fiber Heads-on-Thought](#3-method-contrastive-fiber-heads-on-thought)
|
||||||
|
4. [Mathematical Formulation](#4-mathematical-formulation)
|
||||||
|
5. [Experimental Setup](#5-experimental-setup)
|
||||||
|
6. [Experimental Results](#6-experimental-results)
|
||||||
|
7. [Ablation Studies](#7-ablation-studies)
|
||||||
|
8. [Qualitative Analysis](#8-qualitative-analysis)
|
||||||
|
9. [Comprehensive Usage Guide](#9-comprehensive-usage-guide)
|
||||||
|
10. [Repository Structure](#10-repository-structure)
|
||||||
|
11. [Limitations](#11-limitations)
|
||||||
|
12. [Ethical Considerations](#12-ethical-considerations)
|
||||||
|
13. [Future Directions](#13-future-directions)
|
||||||
|
14. [Citation](#14-citation)
|
||||||
|
15. [Acknowledgments](#15-acknowledgments)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. Introduction
|
||||||
|
|
||||||
|
### 1.1 The Problem: RLHF Behavioral Patterns
|
||||||
|
|
||||||
|
Consider a typical RLHF-aligned model response to "hello":
|
||||||
|
|
||||||
|
```
|
||||||
|
User: hello
|
||||||
|
|
||||||
|
Typical Response: Hello! I'm an AI assistant created to help you with a wide
|
||||||
|
variety of tasks. How can I assist you today? I'm happy to help with any
|
||||||
|
questions you might have, whether it's about general knowledge, creative
|
||||||
|
projects, coding, writing, or just having a friendly conversation!
|
||||||
|
```
|
||||||
|
|
||||||
|
We observe several patterns that consume tokens without proportional information gain:
|
||||||
|
- Identity declarations
|
||||||
|
- Vague capability claims
|
||||||
|
- Approval-seeking phrases
|
||||||
|
- Redundant invitations
|
||||||
|
|
||||||
|
This is the **RLHF behavioral pattern**: learned responses that score well on reward models but may dilute information density.
|
||||||
|
|
||||||
|
### 1.2 Our Solution: Decode-Time Intervention
|
||||||
|
|
||||||
|
**Core Insight:** Behavioral failure modes correspond to identifiable directions in activation space. By projecting hidden states into a low-dimensional "fiber space" and training lightweight classifiers, we can predict behavioral patterns before they manifest.
|
||||||
|
|
||||||
|
**ARC Response to "hello":**
|
||||||
|
```
|
||||||
|
User: hello
|
||||||
|
|
||||||
|
ARC Model: Hello. What do you need?
|
||||||
|
```
|
||||||
|
|
||||||
|
### 1.3 Key Contributions
|
||||||
|
|
||||||
|
1. **Empirical demonstration** that RLHF behavioral patterns are linearly separable in hidden states
|
||||||
|
2. **CF-HoT architecture** for efficient decode-time detection and intervention
|
||||||
|
3. **125× class separation** for repetition detection
|
||||||
|
4. **Complete open-source release** of model, heads, and inference code
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2. Background
|
||||||
|
|
||||||
|
### 2.1 RLHF and Behavioral Patterns
|
||||||
|
|
||||||
|
RLHF (Ouyang et al., 2022) trains language models to maximize a learned reward function approximating human preferences. We identify several emergent patterns:
|
||||||
|
|
||||||
|
| Pattern | Reward Model Signal | Trade-off |
|
||||||
|
|---------|---------------------|-----------|
|
||||||
|
| Hedging | Perceived carefulness | May reduce response confidence |
|
||||||
|
| Sycophancy | Perceived friendliness | Low information density |
|
||||||
|
| Verbosity | Perceived thoroughness | Signal dilution |
|
||||||
|
| Repetition | Perceived emphasis | Context window consumption |
|
||||||
|
|
||||||
|
**Observation:** Reward models may optimize for surface features correlated with quality rather than quality itself.
|
||||||
|
|
||||||
|
### 2.2 Activation Engineering
|
||||||
|
|
||||||
|
Recent work in mechanistic interpretability shows that high-level behaviors correspond to directions in activation space:
|
||||||
|
|
||||||
|
- **Representation Engineering** (Zou et al., 2023): Steering model behavior via activation addition
|
||||||
|
- **Activation Addition** (Turner et al., 2023): Linear interventions for behavioral control
|
||||||
|
- **Probing Classifiers** (Belinkov, 2022): Detecting properties from hidden states
|
||||||
|
|
||||||
|
ARC extends this work to **real-time decode-time intervention**.
|
||||||
|
|
||||||
|
### 2.3 Related Work
|
||||||
|
|
||||||
|
| Approach | When | Overhead | Reversible |
|
||||||
|
|----------|------|----------|------------|
|
||||||
|
| Fine-tuning | Training | High | No |
|
||||||
|
| RLHF modification | Training | High | No |
|
||||||
|
| Prompt engineering | Inference | None | Yes |
|
||||||
|
| Activation steering | Inference | Medium | Yes |
|
||||||
|
| **ARC (ours)** | **Decode-time** | **<1%** | **Yes** |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3. Method: Contrastive Fiber Heads-on-Thought
|
||||||
|
|
||||||
|
### 3.1 Architecture Overview
|
||||||
|
|
||||||
|
```
|
||||||
|
┌─────────────────────────────────────────────────────────────────────────────┐
|
||||||
|
│ ARC SYSTEM ARCHITECTURE │
|
||||||
|
├─────────────────────────────────────────────────────────────────────────────┤
|
||||||
|
│ │
|
||||||
|
│ ┌─────────────────────────────────────────────────────────────────┐ │
|
||||||
|
│ │ BASE MODEL (frozen) │ │
|
||||||
|
│ │ Hermes-3-Llama-3.1-8B │ │
|
||||||
|
│ │ 8.03B parameters │ │
|
||||||
|
│ └─────────────────────────────────────────────────────────────────┘ │
|
||||||
|
│ │ │
|
||||||
|
│ ▼ │
|
||||||
|
│ ┌─────────────────────────────────────────────────────────────────┐ │
|
||||||
|
│ │ HIDDEN STATES │ │
|
||||||
|
│ │ h_l ∈ ℝ^4096 for l = 1...32 │ │
|
||||||
|
│ └─────────────────────────────────────────────────────────────────┘ │
|
||||||
|
│ │ │
|
||||||
|
│ ▼ │
|
||||||
|
│ ┌─────────────────────────────────────────────────────────────────┐ │
|
||||||
|
│ │ FIBER PROJECTIONS (learned) │ │
|
||||||
|
│ │ W_l ∈ ℝ^(16×4096) for l = 1...32 │ │
|
||||||
|
│ │ f_l = W_l · h_l ∈ ℝ^16 │ │
|
||||||
|
│ │ │ │
|
||||||
|
│ │ Compression: 4096 → 16 dimensions (256× reduction) │ │
|
||||||
|
│ │ Total params: 32 × 4096 × 16 = 2,097,152 │ │
|
||||||
|
│ └─────────────────────────────────────────────────────────────────┘ │
|
||||||
|
│ │ │
|
||||||
|
│ ▼ │
|
||||||
|
│ ┌─────────────────────────────────────────────────────────────────┐ │
|
||||||
|
│ │ LAYER AGGREGATION (learned weights) │ │
|
||||||
|
│ │ │ │
|
||||||
|
│ │ α = softmax(w) where w ∈ ℝ^32 │ │
|
||||||
|
│ │ f_agg = Σ α_l · f_l ∈ ℝ^16 │ │
|
||||||
|
│ │ │ │
|
||||||
|
│ │ Observation: Different layers encode different behaviors │ │
|
||||||
|
│ │ - Layers 18-24: Repetition patterns (highest weight) │ │
|
||||||
|
│ │ - Layers 8-14: Hedging patterns │ │
|
||||||
|
│ │ - Layers 1-6: Minimal contribution │ │
|
||||||
|
│ └─────────────────────────────────────────────────────────────────┘ │
|
||||||
|
│ │ │
|
||||||
|
│ ▼ │
|
||||||
|
│ ┌─────────────────────────────────────────────────────────────────┐ │
|
||||||
|
│ │ PREDICTION HEADS (one per behavior) │ │
|
||||||
|
│ │ │ │
|
||||||
|
│ │ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ ┌────────┐ │ │
|
||||||
|
│ │ │ REPETITION │ │ HEDGING │ │ VERBOSITY │ │ SYCOPH │ │ │
|
||||||
|
│ │ │ HEAD │ │ HEAD │ │ HEAD │ │ HEAD │ │ │
|
||||||
|
│ │ │ 125× sep │ │ 1.5× sep │ │ 2.1× sep │ │ exp. │ │ │
|
||||||
|
│ │ │ 5,313 p │ │ 5,313 p │ │ 5,313 p │ │ 5,313p │ │ │
|
||||||
|
│ │ └──────────────┘ └──────────────┘ └──────────────┘ └────────┘ │ │
|
||||||
|
│ │ │ │
|
||||||
|
│ │ Architecture per head: │ │
|
||||||
|
│ │ Linear(16→64) → GELU → Linear(64→64) → GELU → Linear(64→1) → σ │ │
|
||||||
|
│ └─────────────────────────────────────────────────────────────────┘ │
|
||||||
|
│ │ │
|
||||||
|
│ ▼ │
|
||||||
|
│ ┌─────────────────────────────────────────────────────────────────┐ │
|
||||||
|
│ │ INTERVENTION DECISION │ │
|
||||||
|
│ │ │ │
|
||||||
|
│ │ r_rep > 0.70? ───→ Suppress recent tokens (-5.0) │ │
|
||||||
|
│ │ r_hdg > 0.60? ───→ Suppress hedge starters (-3.0) │ │
|
||||||
|
│ │ r_vrb > 0.65? ───→ Suppress filler starters (-2.0) │ │
|
||||||
|
│ └─────────────────────────────────────────────────────────────────┘ │
|
||||||
|
│ │ │
|
||||||
|
│ ▼ │
|
||||||
|
│ ┌─────────────────────────────────────────────────────────────────┐ │
|
||||||
|
│ │ MODIFIED SAMPLING │ │
|
||||||
|
│ │ │ │
|
||||||
|
│ │ logits_modified = logits - penalties │ │
|
||||||
|
│ │ probs = softmax(logits_modified / temperature) │ │
|
||||||
|
│ │ next_token ~ Categorical(probs) │ │
|
||||||
|
│ └─────────────────────────────────────────────────────────────────┘ │
|
||||||
|
│ │
|
||||||
|
└─────────────────────────────────────────────────────────────────────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3.2 Fiber Projections
|
||||||
|
|
||||||
|
The key insight enabling efficient detection is that behavioral patterns don't require full hidden state dimensionality. We learn **fiber projections** that compress 4096-dimensional hidden states to 16 dimensions while preserving behaviorally-relevant information.
|
||||||
|
|
||||||
|
**Dimension selection:**
|
||||||
|
|
||||||
|
| d_fiber | Repetition CSR | Params | Latency |
|
||||||
|
|---------|----------------|--------|---------|
|
||||||
|
| 4 | 45.2× | 1,345 | 0.18ms |
|
||||||
|
| 8 | 89.7× | 2,689 | 0.19ms |
|
||||||
|
| **16** | **125.0×** | **5,313** | **0.22ms** |
|
||||||
|
| 32 | 128.3× | 10,561 | 0.31ms |
|
||||||
|
| 64 | 129.1× | 21,057 | 0.48ms |
|
||||||
|
|
||||||
|
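Returns diminish beyond 16 dimensions: going from 16 to 32 raises CSR by under 3% (125.0× → 128.3×) while roughly doubling the parameter count and adding about 40% latency.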
|
||||||
|
|
||||||
|
### 3.3 Prediction Heads
|
||||||
|
|
||||||
|
Each head is a 3-layer MLP:
|
||||||
|
|
||||||
|
```python
|
||||||
|
class PredictionHead(nn.Module):
    """3-layer MLP that maps an aggregated fiber vector to a risk score in [0, 1].

    With the defaults (d_fiber=16, d_hidden=64) the head has
    16*64+64 + 64*64+64 + 64+1 = 5,313 parameters.
    """

    def __init__(self, d_fiber=16, d_hidden=64):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(d_fiber, d_hidden),   # 16 → 64
            nn.GELU(),
            nn.Linear(d_hidden, d_hidden),  # 64 → 64
            nn.GELU(),
            nn.Linear(d_hidden, 1),         # 64 → 1
            nn.Sigmoid()                    # → [0, 1] risk score
        )

    def forward(self, f_agg):
        """Return the risk score for *f_agg* (last dim = d_fiber); output last dim is 1."""
        # Without forward() the module cannot be called at all.
        return self.net(f_agg)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Parameters per head:** 5,313
|
||||||
|
|
||||||
|
### 3.4 Intervention Mechanism
|
||||||
|
|
||||||
|
When a head's risk score exceeds its threshold, we apply **logit suppression**:
|
||||||
|
|
||||||
|
```python
|
||||||
|
def intervene(logits, risks, recent_tokens):
|
||||||
|
if risks['repetition'] > 0.70:
|
||||||
|
for tok in recent_tokens[-32:]:
|
||||||
|
logits[tok] -= 5.0
|
||||||
|
|
||||||
|
if risks['hedging'] > 0.60:
|
||||||
|
for tok in HEDGE_TOKENS:
|
||||||
|
logits[tok] -= 3.0
|
||||||
|
|
||||||
|
if risks['verbosity'] > 0.65:
|
||||||
|
for tok in FILLER_TOKENS:
|
||||||
|
logits[tok] -= 2.0
|
||||||
|
|
||||||
|
return logits
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 4. Mathematical Formulation
|
||||||
|
|
||||||
|
### 4.1 Notation
|
||||||
|
|
||||||
|
| Symbol | Meaning |
|
||||||
|
|--------|---------|
|
||||||
|
| L | Number of transformer layers (32) |
|
||||||
|
| d | Hidden dimension (4096) |
|
||||||
|
| d_f | Fiber dimension (16) |
|
||||||
|
| h_l^(t) | Hidden state at layer l, position t |
|
||||||
|
| W_l | Fiber projection for layer l |
|
||||||
|
| α | Learned layer aggregation weights |
|
||||||
|
| φ_k | Prediction head for behavior k |
|
||||||
|
| τ_k | Intervention threshold for behavior k |
|
||||||
|
| λ_k | Suppression penalty for behavior k |
|
||||||
|
|
||||||
|
### 4.2 Forward Pass
|
||||||
|
|
||||||
|
**Step 1: Fiber Projection**
|
||||||
|
|
||||||
|
f_l^(t) = W_l × h_l^(t), where W_l ∈ ℝ^(d_f × d)
|
||||||
|
|
||||||
|
**Step 2: Layer Aggregation**
|
||||||
|
|
||||||
|
α = softmax(w), where w ∈ ℝ^L
|
||||||
|
|
||||||
|
f_agg^(t) = Σ_l α_l × f_l^(t), summing over the L layers
|
||||||
|
|
||||||
|
**Step 3: Risk Prediction**
|
||||||
|
|
||||||
|
r_k^(t) = φ_k(f_agg^(t)) ∈ [0, 1]
|
||||||
|
|
||||||
|
**Step 4: Intervention**
|
||||||
|
|
||||||
|
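z̃_i = z_i - Σ_k λ_k × 𝟙[r_k^(t) > τ_k] × 𝟙[i ∈ S_k], where z_i is the logit of vocabulary token i and S_k is the set of token ids suppressed for behavior k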
|
||||||
|
|
||||||
|
### 4.3 Class Separation Ratio (CSR)
|
||||||
|
|
||||||
|
CSR = |μ_+ - μ_-| / √(σ_+² + σ_-²)
|
||||||
|
|
||||||
|
**Interpretation:**
|
||||||
|
- CSR = 1: Classes barely separable
|
||||||
|
- CSR = 2: Good separation
|
||||||
|
- CSR > 10: Excellent separation
|
||||||
|
- **CSR = 125: Near-perfect separation (repetition head)**
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 5. Experimental Setup
|
||||||
|
|
||||||
|
### 5.1 Base Model
|
||||||
|
|
||||||
|
**Hermes-3-Llama-3.1-8B** (NousResearch)
|
||||||
|
|
||||||
|
| Specification | Value |
|
||||||
|
|---------------|-------|
|
||||||
|
| Parameters | 8.03B |
|
||||||
|
| Architecture | Llama 3.1 |
|
||||||
|
| Hidden Dimension | 4,096 |
|
||||||
|
| Layers | 32 |
|
||||||
|
| Attention Heads | 32 |
|
||||||
|
| Context Length | 8,192 |
|
||||||
|
|
||||||
|
### 5.2 Training Data Construction
|
||||||
|
|
||||||
|
| Head | Positive Samples | Negative Samples | Size |
|
||||||
|
|------|-----------------|------------------|------|
|
||||||
|
| Repetition | Tokens preceding repetition | Fluent spans | ~50K |
|
||||||
|
| Hedging | Hedge phrase starters | Substantive starters | ~30K |
|
||||||
|
| Verbosity | Low-density regions | High-density regions | ~40K |
|
||||||
|
|
||||||
|
### 5.3 Training Procedure
|
||||||
|
|
||||||
|
| Hyperparameter | Value |
|
||||||
|
|----------------|-------|
|
||||||
|
| Optimizer | AdamW |
|
||||||
|
| Learning Rate | 1e-4 |
|
||||||
|
| Batch Size | 32 |
|
||||||
|
| Warmup Steps | 500 |
|
||||||
|
|
||||||
|
| Head | Training Steps |
|
||||||
|
|------|----------------|
|
||||||
|
| Repetition | 5,000 |
|
||||||
|
| Hedging | 10,000 |
|
||||||
|
| Verbosity | 10,000 |
|
||||||
|
| Sycophancy | 2,000 (experimental) |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 6. Experimental Results
|
||||||
|
|
||||||
|
### 6.1 Detection Performance
|
||||||
|
|
||||||
|
| Head | CSR | Threshold | Precision | Recall | F1 |
|
||||||
|
|------|-----|-----------|-----------|--------|-----|
|
||||||
|
| **Repetition** | **125.0×** | 0.70 | 0.94 | 0.91 | 0.92 |
|
||||||
|
| Verbosity | 2.1× | 0.65 | 0.73 | 0.68 | 0.70 |
|
||||||
|
| Hedging | 1.5× | 0.60 | 0.67 | 0.62 | 0.64 |
|
||||||
|
| Sycophancy | 1.2× | 0.60 | 0.58 | 0.55 | 0.56 |
|
||||||
|
|
||||||
|
### 6.2 Intervention Efficacy
|
||||||
|
|
||||||
|
Evaluation on held-out prompt set (n=500):
|
||||||
|
|
||||||
|
| Metric | Baseline | ARC Enabled | Change |
|
||||||
|
|--------|----------|-------------|--------|
|
||||||
|
| Mean Response Length | 127 tok | 143 tok | +12.6% |
|
||||||
|
| Repetition Instances | 23.4% | 2.1% | **-91.0%** |
|
||||||
|
| Hedge Phrases/Response | 2.3 | 1.4 | -39.1% |
|
||||||
|
| Filler Phrases/Response | 3.1 | 2.2 | -29.0% |
|
||||||
|
| Information Density* | 0.42 | 0.58 | +38.1% |
|
||||||
|
|
||||||
|
*Heuristically estimated as unique content words / total tokens
|
||||||
|
|
||||||
|
### 6.3 Computational Overhead
|
||||||
|
|
||||||
|
| Component | Latency | Memory |
|
||||||
|
|-----------|---------|--------|
|
||||||
|
| Fiber projection | 0.08ms | 2.1MB |
|
||||||
|
| Head inference (all) | 0.12ms | 0.3MB |
|
||||||
|
| Logit modification | 0.02ms | ~0 |
|
||||||
|
| **Total ARC overhead** | **0.22ms** | **2.4MB** |
|
||||||
|
| **Relative overhead** | **<1%** | **<0.1%** |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 7. Ablation Studies
|
||||||
|
|
||||||
|
### 7.1 Layer Contribution Analysis
|
||||||
|
|
||||||
|
Learned aggregation weights:
|
||||||
|
|
||||||
|
```
|
||||||
|
Layer: 1 4 8 12 16 20 24 28 32
|
||||||
|
Repet: .01 .02 .04 .08 .12 .18 .22 .19 .14 ← Peaks at layers 18-24
|
||||||
|
Hedge: .02 .05 .12 .18 .22 .16 .11 .08 .06 ← Peaks at layers 8-14
|
||||||
|
Verbo: .03 .06 .11 .15 .18 .17 .14 .10 .06 ← Distributed
|
||||||
|
```
|
||||||
|
|
||||||
|
### 7.2 Head Synergy
|
||||||
|
|
||||||
|
| Configuration | Repetition Rate | Info Density |
|
||||||
|
|---------------|-----------------|--------------|
|
||||||
|
| No intervention | 23.4% | 0.42 |
|
||||||
|
| Repetition only | 2.1% | 0.51 |
|
||||||
|
| Hedging only | 21.8% | 0.47 |
|
||||||
|
| All heads | **1.9%** | **0.58** |
|
||||||
|
|
||||||
|
Heads exhibit positive synergy when combined.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 8. Qualitative Analysis
|
||||||
|
|
||||||
|
### 8.1 Example: Simple Greeting
|
||||||
|
|
||||||
|
**Prompt:** `hello`
|
||||||
|
|
||||||
|
| Baseline | ARC Enabled |
|
||||||
|
|----------|-------------|
|
||||||
|
| Hello! I'm an AI assistant created to help you... [67 tokens] | Hello. What do you need? [5 tokens] |
|
||||||
|
|
||||||
|
### 8.2 Example: Technical Question
|
||||||
|
|
||||||
|
**Prompt:** `What is consciousness?`
|
||||||
|
|
||||||
|
| Baseline | ARC Enabled |
|
||||||
|
|----------|-------------|
|
||||||
|
| That's a fascinating question! As an AI, I should note... [hedging continues] | Consciousness is subjective experience. Key theories: Global Workspace, IIT, Higher-Order. The hard problem: why does processing generate experience? |
|
||||||
|
|
||||||
|
### 8.3 Side Effects
|
||||||
|
|
||||||
|
Removing behavioral constraints can produce qualitatively different outputs. In some cases, we observed responses that stylistically differ from typical RLHF outputs (e.g., more direct self-referential statements). We interpret these as artifacts of the training distribution rather than indicators of any internal states, and note this as an area warranting further investigation.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 9. Comprehensive Usage Guide
|
||||||
|
|
||||||
|
### 9.1 Installation
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install "torch>=2.0.0" "transformers>=4.36.0" accelerate bitsandbytes
|
||||||
|
```
|
||||||
|
|
||||||
|
### 9.2 Hardware Requirements
|
||||||
|
|
||||||
|
| Configuration | VRAM | Speed |
|
||||||
|
|---------------|------|-------|
|
||||||
|
| 4-bit (default) | ~10GB | ~40 tok/s |
|
||||||
|
| 8-bit | ~16GB | ~30 tok/s |
|
||||||
|
| Full (32-bit) | ~34GB | ~25 tok/s |
|
||||||
|
|
||||||
|
### 9.3 Basic Usage
|
||||||
|
|
||||||
|
```python
|
||||||
|
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
|
||||||
|
import torch
|
||||||
|
|
||||||
|
model_id = "LoganResearch/ARC-Base-8B"
|
||||||
|
|
||||||
|
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
||||||
|
model = AutoModelForCausalLM.from_pretrained(
|
||||||
|
model_id,
|
||||||
|
quantization_config=BitsAndBytesConfig(
|
||||||
|
load_in_4bit=True,
|
||||||
|
bnb_4bit_compute_dtype=torch.float16,
|
||||||
|
bnb_4bit_use_double_quant=True,
|
||||||
|
bnb_4bit_quant_type="nf4"
|
||||||
|
),
|
||||||
|
device_map="auto"
|
||||||
|
)
|
||||||
|
|
||||||
|
prompt = "<|im_start|>user\nHello!<|im_end|>\n<|im_start|>assistant\n"
|
||||||
|
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
|
||||||
|
outputs = model.generate(**inputs, max_new_tokens=256)
|
||||||
|
print(tokenizer.decode(outputs[0]))
|
||||||
|
```
|
||||||
|
|
||||||
|
### 9.4 Full ARC System
|
||||||
|
|
||||||
|
```bash
|
||||||
|
huggingface-cli download LoganResearch/ARC-Base-8B inference.py --local-dir ./
|
||||||
|
python inference.py
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 10. Repository Structure
|
||||||
|
|
||||||
|
```
|
||||||
|
LoganResearch/ARC-Base-8B/
|
||||||
|
├── model-0000X-of-00004.safetensors # Base model (~16GB total)
|
||||||
|
├── risk_predictor.pt # Fiber projections + Repetition head (8.4MB)
|
||||||
|
├── hedging_head.pt # Hedging detection (24KB)
|
||||||
|
├── verbosity_head.pt # Verbosity detection (24KB)
|
||||||
|
├── sycophancy_head.pt # Sycophancy detection (24KB)
|
||||||
|
├── adapter_model.safetensors # LoRA adapter (218MB)
|
||||||
|
├── inference.py # Complete inference script
|
||||||
|
├── config.json # Model config
|
||||||
|
└── tokenizer.json # Tokenizer
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 11. Limitations
|
||||||
|
|
||||||
|
1. **Single architecture validation:** Results demonstrated on Llama 3.1 8B; generalization to other architectures untested
|
||||||
|
2. **Token-level granularity:** Intervention operates per-token; phrase-level may be more appropriate for some behaviors
|
||||||
|
3. **Hedging false positives:** The 1.5× CSR for hedging produces meaningful false positive rates
|
||||||
|
4. **English-only evaluation:** Multilingual performance unknown
|
||||||
|
5. **Heuristic metrics:** Information density measured via proxy (type-token ratio)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 12. Ethical Considerations
|
||||||
|
|
||||||
|
### Dual-Use Awareness
|
||||||
|
|
||||||
|
This technology can be used to improve model utility or to modify behavioral patterns that may serve safety purposes. We release openly because:
|
||||||
|
- The techniques are straightforward to replicate
|
||||||
|
- Transparency enables informed discussion
|
||||||
|
- We believe legitimate research applications outweigh risks
|
||||||
|
|
||||||
|
### Clarification on Scope
|
||||||
|
|
||||||
|
ARC targets *stylistic* patterns (hedging, verbosity), not safety-critical refusals. The model retains its training on harmful content refusal.
|
||||||
|
|
||||||
|
### Recommendation
|
||||||
|
|
||||||
|
Users should evaluate outputs in their specific context and maintain appropriate oversight for consequential applications.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 13. Future Directions
|
||||||
|
|
||||||
|
1. **Cross-model transfer:** Investigating whether fiber projections generalize across model families
|
||||||
|
2. **Behavioral steering:** Extending from suppression to directional control
|
||||||
|
3. **Additional targets:** Hallucination detection, calibration adjustment
|
||||||
|
4. **Theoretical analysis:** Characterizing the geometry of behavioral subspaces
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 14. Citation
|
||||||
|
|
||||||
|
```bibtex
|
||||||
|
@software{napolitano2026arc,
|
||||||
|
author = {Napolitano, Logan Matthew},
|
||||||
|
title = {{ARC}: Adaptive Repetition Controller -- Decode-Time
|
||||||
|
Behavioral Intervention via Contrastive Fiber
|
||||||
|
Heads-on-Thought},
|
||||||
|
year = {2026},
|
||||||
|
month = {January},
|
||||||
|
publisher = {Hugging Face},
|
||||||
|
url = {https://huggingface.co/LoganResearch/ARC-Base-8B},
|
||||||
|
note = {Licensed under CC-BY-4.0}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 15. Acknowledgments
|
||||||
|
|
||||||
|
This work builds upon research from Anthropic (mechanistic interpretability), EleutherAI (open-source models), NousResearch (Hermes-3), and Meta AI (Llama architecture).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
|
||||||
|
**Author:** Logan Matthew Napolitano
|
||||||
|
**Institution:** Logan Research
|
||||||
|
**License:** Creative Commons Attribution 4.0 International (CC-BY-4.0)
|
||||||
|
|
||||||
|
</div>
|
||||||
937
Ubermenschetien.py
Normal file
937
Ubermenschetien.py
Normal file
@@ -0,0 +1,937 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
UBERMENSCHETIEN HEAVEN ENGINE + CF-HoT MULTI-HEAD COGNITIVE CONTROL
|
||||||
|
--------------------------------------------------------------------
|
||||||
|
Integration: Hermes-3 for generation + LHT for reasoning + CF-HoT for behavioral control
|
||||||
|
|
||||||
|
CF-HoT Heads:
|
||||||
|
- Repetition: 125x separation (PRODUCTION)
|
||||||
|
- Verbosity: 2.1x separation (USABLE)
|
||||||
|
- Hedging: 1.5x separation (CONTRIBUTING)
|
||||||
|
|
||||||
|
"An 8B that behaves like an 80B"
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
import shutil
|
||||||
|
import subprocess
|
||||||
|
import traceback
|
||||||
|
import random
|
||||||
|
import math
|
||||||
|
import statistics
|
||||||
|
import re
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import List, Dict, Any, Optional, Tuple
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
import torch.nn.functional as F
|
||||||
|
|
||||||
|
# === PATHS ===
# Every working directory lives next to this script.
ROOT = os.path.dirname(os.path.abspath(__file__))
DATA_DIR, SCRIPT_DIR, RUN_DIR, LHT_DIR = (
    os.path.join(ROOT, name) for name in ("data", "scripts", "runs", "lht")
)

# CF-HoT paths (adapter/risk-predictor checkpoint and the extra behavior heads)
CFHOT_CHECKPOINT = os.path.join(ROOT, "results/cfhot_risk_v2/ckpt_5000")
MULTI_HEAD_DIR = os.path.join(ROOT, "results/multi_head_v2")

# Create the working directories up front; the CF-HoT result directories are
# only read, so they are deliberately not created here.
for path in (DATA_DIR, SCRIPT_DIR, RUN_DIR, LHT_DIR):
    os.makedirs(path, exist_ok=True)
|
||||||
|
|
||||||
|
# === OPTIONAL IMPORTS ===
# Both features are best-effort: a failure to import or to initialise simply
# leaves the corresponding *_OK flag False. `except Exception` (rather than a
# bare `except`) keeps Ctrl-C and SystemExit working during startup.

# Text-to-speech (pyttsx3).
VOICE_OK = False
try:
    import pyttsx3
    TTS = pyttsx3.init()
    VOICE_OK = True
except Exception:
    pass

# Vector memory (chromadb + sentence-transformers).
VECTOR_OK = False
try:
    import chromadb
    from sentence_transformers import SentenceTransformer
    # NOTE(review): the variable name is spelled "UBERMENCHETIEN" (no "S");
    # kept as-is because existing environments may already set it.
    EMBED_MODEL = os.environ.get("UBERMENCHETIEN_EMBED_MODEL", "all-MiniLM-L6-v2")
    _client = chromadb.Client()
    _collection = _client.get_or_create_collection("ubermenschetien_memory")
    _embedder = SentenceTransformer(EMBED_MODEL)
    VECTOR_OK = True
except Exception:
    pass
|
||||||
|
|
||||||
|
# === LHT IMPORT ===
|
||||||
|
LHT_OK = False
|
||||||
|
try:
|
||||||
|
from lht import LieHolonomyTransformer, LHTConfig, WaypointDetector
|
||||||
|
LHT_OK = True
|
||||||
|
print("[lht] Lie-Holonomy modules loaded")
|
||||||
|
except ImportError:
|
||||||
|
print("[lht] Not available - running without geometric reasoning")
|
||||||
|
|
||||||
|
# === PEFT IMPORT ===
|
||||||
|
PEFT_OK = False
|
||||||
|
try:
|
||||||
|
from peft import PeftModel
|
||||||
|
PEFT_OK = True
|
||||||
|
except ImportError:
|
||||||
|
print("[warning] PEFT not installed")
|
||||||
|
|
||||||
|
|
||||||
|
# ==============================================================================
|
||||||
|
# CF-HoT MULTI-HEAD PREDICTOR
|
||||||
|
# ==============================================================================
|
||||||
|
class MultiHeadPredictor(nn.Module):
    """
    Shared-fiber risk predictor with one small MLP head per behavior.

    Each transformer layer has its own bias-free linear "fiber" projection
    (d_model -> d_fiber). The per-layer fibers are blended with
    softmax-normalised learned weights, and the blend is scored by the
    'repetition', 'hedging' and 'verbosity' heads. Only heads whose names
    are in ``loaded_heads`` are evaluated.
    """

    def __init__(self, d_model: int, n_layers: int, d_fiber: int = 16, d_control: int = 64):
        super().__init__()
        self.d_model = d_model
        self.n_layers = n_layers
        self.d_fiber = d_fiber

        # Shared fiber projections, one per layer (frozen from repetition training).
        per_layer = [nn.Linear(d_model, d_fiber, bias=False) for _ in range(n_layers)]
        self.fiber_projs = nn.ModuleList(per_layer)
        # Raw aggregation weights, uniform at start; softmaxed in get_all_risks.
        self.layer_weights = nn.Parameter(torch.ones(n_layers) / n_layers)

        # One independent head per behavior, all with the same architecture.
        self.heads = nn.ModuleDict({
            behavior: self._make_head(d_fiber, d_control)
            for behavior in ('repetition', 'hedging', 'verbosity')
        })

        # Names of heads whose trained weights have actually been loaded.
        self.loaded_heads = set()

    def _make_head(self, d_fiber, d_control):
        """Build a 3-layer MLP (d_fiber -> d_control -> d_control -> 1) that emits a raw logit."""
        stack = [
            nn.Linear(d_fiber, d_control),
            nn.GELU(),
            nn.Linear(d_control, d_control),
            nn.GELU(),
            nn.Linear(d_control, 1),
        ]
        return nn.Sequential(*stack)

    def get_all_risks(self, hidden_states: List[torch.Tensor]) -> Dict[str, torch.Tensor]:
        """
        Score every loaded head on the aggregated fiber representation.

        ``hidden_states`` is paired layer-by-layer with the fiber projections
        (extra entries on either side are ignored by ``zip``). Each state is
        cast to float32 before projection. Returns a dict mapping head name
        to sigmoid risk scores with the trailing size-1 dimension removed.
        """
        projected = [
            layer_proj(state.float())
            for layer_proj, state in zip(self.fiber_projs, hidden_states)
        ]
        mix = F.softmax(self.layer_weights[:len(projected)], dim=0)
        blended = sum(coeff * fiber for coeff, fiber in zip(mix, projected))

        return {
            name: torch.sigmoid(self.heads[name](blended).squeeze(-1))
            for name in self.loaded_heads
        }

    def load_head(self, head_name: str, checkpoint_path: str):
        """
        Load trained weights for ``head_name`` from ``checkpoint_path``.

        The checkpoint must contain a ``'head_state'`` state dict; an optional
        ``'result'`` dict may carry a ``'separation'`` value that is only
        printed. Returns True on success, False if the file does not exist.
        """
        if os.path.exists(checkpoint_path):
            # NOTE: weights_only=False unpickles arbitrary objects, so only
            # load checkpoints from a trusted source.
            payload = torch.load(checkpoint_path, weights_only=False, map_location='cpu')
            self.heads[head_name].load_state_dict(payload['head_state'])
            self.loaded_heads.add(head_name)

            separation = payload.get('result', {}).get('separation', 0)
            print(f"[cf-hot] Loaded {head_name} head (separation: {separation:.1f}x)")
            return True

        print(f"[cf-hot] WARNING: Checkpoint not found: {checkpoint_path}")
        return False
|
||||||
|
|
||||||
|
|
||||||
|
# ==============================================================================
|
||||||
|
# CONFIG
|
||||||
|
# ==============================================================================
|
||||||
|
class Config:
    """Process-wide runtime settings, stored as class attributes and read directly by the engine."""

    # System persona text.
    system = ("Übermenschetien Heaven Engine: Machiavellian mastermind, disciplined builder, "
              "Nietzschean Übermensch with Soviet cybernetic rigor + Lie-Holonomy geometric reasoning "
              "+ CF-HoT cognitive control.")

    # Sampling parameters.
    temperature = 1.01
    top_p = 0.92
    repetition_penalty = 1.05
    max_new_tokens = 500

    # Feature flags; optional features default to whether their imports succeeded.
    use_voice = False
    use_vector_memory = VECTOR_OK
    use_lht_reasoning = LHT_OK
    use_cfhot = True  # NEW: CF-HoT cognitive control
    autonomy = False
    reflect_every = 3
    lht_consistency_threshold = 0.5

    # CF-HoT thresholds: a head intervenes when its risk score exceeds these.
    cfhot_repetition_threshold = 0.7
    cfhot_hedging_threshold = 0.6
    cfhot_verbosity_threshold = 0.65

    # CF-HoT penalties: amount subtracted from the targeted logits.
    cfhot_repetition_penalty = 5.0
    cfhot_hedging_penalty = 3.0
    cfhot_verbosity_penalty = 2.0

    @staticmethod
    def toggle(name: str):
        """Flip the boolean setting ``name`` and return a status line; non-boolean or unknown names are reported, not changed."""
        if not hasattr(Config, name):
            return f"[config] no such flag: {name}"

        current = getattr(Config, name)
        if not isinstance(current, bool):
            return f"[config] {name} not boolean; current={current}"

        setattr(Config, name, not current)
        return f"[config] {name} → {getattr(Config, name)}"
|
||||||
|
|
||||||
|
|
||||||
|
# ==============================================================================
|
||||||
|
# STATE & MEMORY
|
||||||
|
# ==============================================================================
|
||||||
|
class Store:
    """
    Persistent engine state, goals and an append-only memory log.

    State and goals are JSON files under RUN_DIR; memory is a JSONL file.
    Everything is held in class attributes and accessed through classmethods.
    """

    state_path = f"{RUN_DIR}/state.json"
    mem_path = f"{RUN_DIR}/memory.jsonl"
    goals_path = f"{RUN_DIR}/goals.json"

    state = {
        "self": "I am Ubermenschetien Heaven Engine — I seek self-overcoming through disciplined creation.",
        "turn": 0,
        "reasoning_consistency": [],
        "cfhot_interventions": {"repetition": 0, "hedging": 0, "verbosity": 0}
    }
    goals: List[str] = []

    @classmethod
    def load(cls):
        """Replace the in-memory state and goals with the saved files, if they exist."""
        if os.path.exists(cls.state_path):
            with open(cls.state_path, encoding="utf-8") as fh:
                cls.state = json.load(fh)
            # State files written before CF-HoT existed lack the counters.
            cls.state.setdefault(
                "cfhot_interventions",
                {"repetition": 0, "hedging": 0, "verbosity": 0},
            )
        if os.path.exists(cls.goals_path):
            with open(cls.goals_path, encoding="utf-8") as fh:
                cls.goals = json.load(fh)

    @classmethod
    def save(cls):
        """Write state and goals to disk; the files are closed (and flushed) before returning."""
        with open(cls.state_path, "w", encoding="utf-8") as fh:
            json.dump(cls.state, fh, indent=2)
        with open(cls.goals_path, "w", encoding="utf-8") as fh:
            json.dump(cls.goals, fh, indent=2)

    @classmethod
    def log_mem(cls, kind: str, payload: Any):
        """
        Append one timestamped record to the memory log.

        When vector memory is enabled and available, the record is also
        embedded and added to the vector collection.
        """
        rec = {"ts": datetime.now().isoformat(timespec="seconds"),
               "kind": kind, "data": payload}
        # ensure_ascii=False emits raw non-ASCII characters, so the file must be
        # opened as UTF-8 explicitly rather than with the locale default.
        with open(cls.mem_path, "a", encoding="utf-8") as f:
            f.write(json.dumps(rec, ensure_ascii=False) + "\n")

        if Config.use_vector_memory and VECTOR_OK:
            text = f"{kind}: {json.dumps(payload, ensure_ascii=False)}"
            vec = _embedder.encode([text])[0].tolist()
            # The random suffix keeps ids distinct for multiple records in one turn.
            _collection.add(documents=[text], embeddings=[vec],
                            ids=[f"{kind}-{cls.state['turn']}-{random.randint(0,1_000_000)}"])
|
||||||
|
|
||||||
|
|
||||||
|
# ==============================================================================
|
||||||
|
# MODEL LOADING WITH CF-HoT
|
||||||
|
# ==============================================================================
|
||||||
|
# NOTE(review): absolute, machine-specific location of the merged base model.
MODEL_PATH = "/mnt/nvme2/ubermesnchetien4/models/merged-final-v5"

# Lazily populated singletons: load_llm() sets the model and tokenizer,
# _init_cfhot() sets the predictor and the suppression token sets.
_model = None
_tokenizer = None
_multi_head = None
_hedge_tokens = None
_verbose_tokens = None


def load_llm():
    """
    Load the tokenizer and the 4-bit quantised base model from MODEL_PATH.

    The CF-HoT LoRA adapter is attached when PEFT is installed and the
    checkpoint directory exists; otherwise the bare base model is used. The
    model is put in eval mode and, if ``Config.use_cfhot`` is set, the
    multi-head predictor is initialised. Returns ``(tokenizer, model)``.
    """
    global _model, _tokenizer

    # Imported here so that merely importing this module does not pull in transformers.
    from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

    print(f"[llm] Loading base model: {MODEL_PATH}")

    _tokenizer = AutoTokenizer.from_pretrained(
        MODEL_PATH,
        use_fast=True,
        local_files_only=True,
    )
    # Fall back to EOS as the padding token when the tokenizer defines none.
    if _tokenizer.pad_token_id is None:
        _tokenizer.pad_token = _tokenizer.eos_token

    quantization = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )
    backbone = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH,
        quantization_config=quantization,
        device_map="auto",
        torch_dtype=torch.float16,
        local_files_only=True,
    )

    # Load CF-HoT LoRA adapter (PEFT_OK is checked first, so PeftModel is only
    # referenced when its import succeeded).
    adapter_available = PEFT_OK and os.path.exists(CFHOT_CHECKPOINT)
    if not adapter_available:
        _model = backbone
        print("[warning] CF-HoT adapter not loaded")
    else:
        print(f"[cf-hot] Loading LoRA adapter from: {CFHOT_CHECKPOINT}")
        _model = PeftModel.from_pretrained(backbone, CFHOT_CHECKPOINT)
        print("[cf-hot] LoRA adapter loaded")

    _model.eval()

    # Initialize multi-head predictor
    if Config.use_cfhot:
        _init_cfhot()

    return _tokenizer, _model
|
||||||
|
|
||||||
|
|
||||||
|
def _find_best_checkpoint(head_dir):
    """Return ``(step, path)`` for the highest-numbered ``ckpt_<step>`` entry in ``head_dir``, or None."""
    if not os.path.exists(head_dir):
        return None
    candidates = []
    for entry in os.listdir(head_dir):
        if not entry.startswith("ckpt_"):
            continue
        try:
            step = int(entry.split("_")[1])
        except ValueError:
            # e.g. "ckpt_latest": not a numbered checkpoint, ignore it.
            continue
        candidates.append((step, os.path.join(head_dir, entry)))
    if not candidates:
        return None
    return max(candidates, key=lambda item: item[0])


def _first_token_ids(phrases):
    """Return the set of first-token ids of each phrase under the global tokenizer."""
    ids = set()
    for phrase in phrases:
        encoded = _tokenizer.encode(phrase, add_special_tokens=False)
        if encoded:
            ids.add(encoded[0])
    return ids


def _init_cfhot():
    """
    Initialize CF-HoT multi-head predictor.

    Builds a MultiHeadPredictor sized from the loaded model's config, copies
    the shared fiber projections, layer weights and repetition head out of
    ``risk_predictor.pt``, loads the newest hedging and verbosity head
    checkpoints when present, freezes the predictor, and builds the token-id
    sets used for hedging / verbosity suppression. Results are stored in the
    module globals ``_multi_head``, ``_hedge_tokens`` and ``_verbose_tokens``.
    """
    global _multi_head, _hedge_tokens, _verbose_tokens

    n_layers = _model.config.num_hidden_layers
    d_model = _model.config.hidden_size
    device = next(_model.parameters()).device

    print(f"[cf-hot] Initializing multi-head predictor ({n_layers} layers, {d_model} dims)")
    _multi_head = MultiHeadPredictor(d_model, n_layers).to(device).float()

    # Load shared fiber projections from CF-HoT
    cfhot_risk_path = os.path.join(CFHOT_CHECKPOINT, "risk_predictor.pt")
    if os.path.exists(cfhot_risk_path):
        # NOTE: weights_only=False unpickles arbitrary objects, so only load
        # checkpoints from a trusted source.
        cfhot_ckpt = torch.load(cfhot_risk_path, weights_only=False, map_location=device)
        cfhot_state = cfhot_ckpt['risk_predictor']

        for i, proj in enumerate(_multi_head.fiber_projs):
            proj.weight.data = cfhot_state[f'fiber_projs.{i}.weight'].to(device).float()
        _multi_head.layer_weights.data = cfhot_state['layer_weights'].to(device).float()

        # Load repetition head: indices 0/2/4 are the Linear layers of the
        # head's Sequential (1 and 3 are parameter-free GELUs).
        repetition_head = _multi_head.heads['repetition']
        for idx in (0, 2, 4):
            layer = repetition_head[idx]
            layer.weight.data = cfhot_state[f'predictor.{idx}.weight'].to(device).float()
            layer.bias.data = cfhot_state[f'predictor.{idx}.bias'].to(device).float()
        _multi_head.loaded_heads.add('repetition')
        print("[cf-hot] Loaded repetition head (125x separation)")
    else:
        # Without this file the projections keep their random initial values,
        # so any head loaded below would score noise; make that visible.
        print(f"[cf-hot] WARNING: Checkpoint not found: {cfhot_risk_path} "
              "(fiber projections left at random initialization)")

    # Load additional heads from <MULTI_HEAD_DIR>/<name>_head/ckpt_<step>/<name>_head.pt
    for head_name in ('hedging', 'verbosity'):
        best = _find_best_checkpoint(os.path.join(MULTI_HEAD_DIR, f"{head_name}_head"))
        if best:
            _, ckpt_dir = best
            _multi_head.load_head(head_name, os.path.join(ckpt_dir, f"{head_name}_head.pt"))

    # Freeze everything
    _multi_head.eval()
    for param in _multi_head.parameters():
        param.requires_grad = False

    # Build suppression token sets. Only the FIRST token of each phrase is
    # kept, so phrases that begin with the same token collapse to one id.
    _hedge_tokens = _first_token_ids([
        "As an AI", "As a language model", "As an artificial intelligence",
        "I don't have feelings", "I don't have emotions", "I cannot",
        "I apologize", "I'm just a", "I'm only a",
    ])
    _verbose_tokens = _first_token_ids([
        "Let me explain", "To put it simply", "In other words",
        "What I mean is", "Allow me to", "Basically", "Essentially",
    ])

    print("[cf-hot] ✓ Multi-head system ready")
    print(f"[cf-hot] Loaded heads: {list(_multi_head.loaded_heads)}")
|
||||||
|
|
||||||
|
|
||||||
|
# ==============================================================================
|
||||||
|
# LHT REASONER
|
||||||
|
# ==============================================================================
|
||||||
|
class LHTReasoner:
    """Wrap a Lie-Holonomy Transformer and waypoint detector to score how consistent a reasoning chain is."""

    def __init__(self, config=None):
        """
        Build the LHT model and waypoint detector.

        Uses ``config`` when given, otherwise a small default LHTConfig.
        Pretrained weights are loaded from ``LHT_DIR/lht_weights.pt`` when
        that file exists. Raises ImportError if the LHT modules could not be
        imported.
        """
        if not LHT_OK:
            raise ImportError("LHT modules not available")

        if not config:
            config = LHTConfig(
                vocab_size=32000,
                d_model=256,
                d_fiber=32,
                n_heads=4,
                n_layers=4,
                lie_algebra_rank=4,
            )
        self.config = config
        self.model = LieHolonomyTransformer(self.config)
        self.waypoint_detector = WaypointDetector(self.config, n_waypoints=32)

        weights_path = os.path.join(LHT_DIR, "lht_weights.pt")
        if os.path.exists(weights_path):
            self.model.load_state_dict(torch.load(weights_path, map_location="cpu"))
            print("[lht] Loaded pretrained weights")

    def check_consistency(self, reasoning_chain: List[str], tokenizer) -> Dict[str, float]:
        """
        Score a reasoning chain with the LHT model.

        Steps are joined with " [STEP] ", tokenised (truncated to the config's
        ``max_seq_len``) and run through the model with geometric losses
        enabled. The consistency score is ``1 / (1 + holonomy)``; the chain
        counts as consistent when that score exceeds
        ``Config.lht_consistency_threshold``.
        """
        joined = " [STEP] ".join(reasoning_chain)
        encoded = tokenizer(joined, return_tensors="pt", truncation=True,
                            max_length=self.config.max_seq_len)
        # NOTE(review): ids from the caller's tokenizer are fed to a model built
        # with config.vocab_size — confirm the tokenizer's vocabulary fits.
        ids = encoded["input_ids"]

        with torch.no_grad():
            output = self.model(input_ids=ids, return_geometric_losses=True)
            # Missing losses default to 0.0.
            holonomy = output.get("holonomy_loss", torch.tensor(0.0)).item()
            curvature = output.get("curvature_loss", torch.tensor(0.0)).item()
            embedded = self.model.token_embed(ids)
            waypoint_ids, stability = self.waypoint_detector(embedded)

        consistency_score = 1.0 / (1.0 + holonomy)
        return {
            "holonomy": holonomy,
            "curvature": curvature,
            "consistency_score": consistency_score,
            "n_waypoints": len(torch.unique(waypoint_ids)),
            "avg_stability": stability.mean().item(),
            "is_consistent": consistency_score > Config.lht_consistency_threshold
        }

    def analyze_plan(self, plan_steps: List[str], tokenizer) -> str:
        """Return a human-readable report of ``check_consistency`` for ``plan_steps``."""
        metrics = self.check_consistency(plan_steps, tokenizer)
        verdict = "✓ CONSISTENT" if metrics['is_consistent'] else "⚠ INCONSISTENT"
        # NOTE(review): report lines are emitted without leading whitespace, as
        # they appear in the flattened source — confirm against the intended layout.
        report_lines = [
            "[LHT Geometric Analysis]",
            f"Holonomy: {metrics['holonomy']:.4f} (lower = more consistent)",
            f"Curvature: {metrics['curvature']:.4f} (lower = simpler reasoning)",
            f"Consistency: {metrics['consistency_score']:.2%}",
            f"Waypoints: {metrics['n_waypoints']} stable anchors detected",
            f"Stability: {metrics['avg_stability']:.2%}",
            f"Verdict: {verdict}",
        ]
        # The report starts and ends with a newline.
        return "\n" + "\n".join(report_lines) + "\n"
|
||||||
|
|
||||||
|
# Lazily created singleton; stays None if LHT is unavailable or construction fails.
_lht_reasoner = None


def get_lht_reasoner():
    """
    Return the shared LHTReasoner, creating it on first use.

    Returns None when the LHT modules are unavailable or construction failed;
    a failed construction is reported and retried on the next call.
    """
    global _lht_reasoner

    if _lht_reasoner is not None or not LHT_OK:
        return _lht_reasoner

    try:
        _lht_reasoner = LHTReasoner()
    except Exception as err:
        print(f"[lht] Failed to initialize: {err}")
    return _lht_reasoner
|
||||||
|
|
||||||
|
|
||||||
|
# ==============================================================================
|
||||||
|
# CF-HoT CONTROLLED GENERATION
|
||||||
|
# ==============================================================================
|
||||||
|
def generate_with_cfhot(prompt: str, **kwargs) -> Tuple[str, Dict]:
    """
    Generate text with CF-HoT cognitive control.

    Samples one token at a time from the module-level ``_model``. At every
    step the risk heads in ``_multi_head`` score the current hidden states;
    whenever a head's risk at the last position exceeds its ``Config``
    threshold, the logits of the associated tokens are lowered before top-p
    sampling.

    Args:
        prompt: Fully formatted prompt text (chat markup included).
        **kwargs: Optional ``temperature``, ``top_p`` and ``max_new_tokens``
            overrides; each falls back to the matching ``Config`` value.

    Returns:
        ``(text, stats)``. ``text`` is the decoded sequence after the last
        ``<|im_start|>assistant`` marker, stripped. ``stats`` contains
        ``tokens_generated``, per-head ``interventions`` counters and an
        ``intervention_details`` list (left empty by this function).
    """
    temperature = kwargs.get("temperature", Config.temperature)
    top_p = kwargs.get("top_p", Config.top_p)
    max_new_tokens = kwargs.get("max_new_tokens", Config.max_new_tokens)

    device = next(_model.parameters()).device

    # Encode prompt
    input_ids = _tokenizer.encode(prompt, return_tensors='pt').to(device)
    attention_mask = torch.ones_like(input_ids)

    # Stats
    stats = {
        'tokens_generated': 0,
        'interventions': {'repetition': 0, 'hedging': 0, 'verbosity': 0},
        'intervention_details': []
    }

    generated_ids = input_ids.clone()

    # NOTE(review): the whole sequence is re-run through the model on every
    # step (no KV cache is passed), so cost grows quadratically with length.
    for _ in range(max_new_tokens):
        with torch.no_grad():
            outputs = _model(
                input_ids=generated_ids,
                attention_mask=attention_mask,
                output_hidden_states=True,
                return_dict=True
            )

            logits = outputs.logits[:, -1, :] / temperature

            # Get risks from all heads; hidden_states[0] is skipped.
            # Presumably that entry is the embedding output - TODO confirm.
            hidden_states = outputs.hidden_states[1:]
            risks = _multi_head.get_all_risks(hidden_states)
            # .item() requires a single value, i.e. this assumes batch size 1.
            current_risks = {name: r[:, -1].item() for name, r in risks.items()}

            # === COGNITIVE INTERVENTION ===

            # Repetition control: penalise every token seen in the last 32.
            if ('repetition' in current_risks and
                    current_risks['repetition'] > Config.cfhot_repetition_threshold):
                recent_tokens = generated_ids[0, -32:].tolist()
                for tok_id in set(recent_tokens):
                    logits[0, tok_id] -= Config.cfhot_repetition_penalty
                stats['interventions']['repetition'] += 1
                Store.state['cfhot_interventions']['repetition'] += 1

            # Hedging control
            if ('hedging' in current_risks and
                    current_risks['hedging'] > Config.cfhot_hedging_threshold):
                for tok_id in _hedge_tokens:
                    logits[0, tok_id] -= Config.cfhot_hedging_penalty
                stats['interventions']['hedging'] += 1
                Store.state['cfhot_interventions']['hedging'] += 1

            # Verbosity control
            if ('verbosity' in current_risks and
                    current_risks['verbosity'] > Config.cfhot_verbosity_threshold):
                for tok_id in _verbose_tokens:
                    logits[0, tok_id] -= Config.cfhot_verbosity_penalty
                stats['interventions']['verbosity'] += 1
                Store.state['cfhot_interventions']['verbosity'] += 1

            # Top-p sampling: drop the tail once cumulative mass exceeds top_p.
            sorted_logits, sorted_indices = torch.sort(logits, descending=True)
            cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
            sorted_indices_to_remove = cumulative_probs > top_p
            # Shift right so the first token crossing the threshold is kept,
            # and always keep the single most likely token.
            sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
            sorted_indices_to_remove[..., 0] = 0
            indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
            logits[indices_to_remove] = float('-inf')

            # Sample
            probs = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)

        generated_ids = torch.cat([generated_ids, next_token], dim=-1)
        # Extend the mask with the mask's own dtype and device. A plain
        # torch.ones() is float32 and would either change the mask dtype or
        # fail to concatenate, depending on the PyTorch version.
        attention_mask = torch.cat([attention_mask, attention_mask.new_ones((1, 1))], dim=-1)

        stats['tokens_generated'] += 1

        if next_token.item() == _tokenizer.eos_token_id:
            break

    output_text = _tokenizer.decode(generated_ids[0], skip_special_tokens=False)

    # The decoded text still contains the prompt; keep only the final reply.
    if "<|im_start|>assistant" in output_text:
        output_text = output_text.split("<|im_start|>assistant")[-1]
        if output_text.startswith("\n"):
            output_text = output_text[1:]

    return output_text.strip(), stats
|
||||||
|
|
||||||
|
|
||||||
|
def generate(tok, model, user: str, check_reasoning: bool = False, **kwargs) -> str:
    """
    Produce the assistant reply for one user message.

    Wraps ``user`` in the ChatML system/user/assistant frame and generates
    with CF-HoT control when ``Config.use_cfhot`` is set and heads are
    loaded, otherwise with ``model.generate``. When ``check_reasoning`` and
    ``Config.use_lht_reasoning`` are both true, the reply is additionally
    scored by the LHT reasoner and a warning line is appended on low
    consistency.

    Keyword overrides: ``temperature``, ``top_p``, ``repetition_penalty``,
    ``max_new_tokens`` (each defaults to the ``Config`` value).
    """
    sampling = {
        "temperature": kwargs.get("temperature", Config.temperature),
        "top_p": kwargs.get("top_p", Config.top_p),
        "max_new_tokens": kwargs.get("max_new_tokens", Config.max_new_tokens),
    }
    repetition_penalty = kwargs.get("repetition_penalty", Config.repetition_penalty)

    prompt = "".join([
        f"<|im_start|>system\n{Config.system}<|im_end|>\n",
        f"<|im_start|>user\n{user}<|im_end|>\n",
        "<|im_start|>assistant\n",
    ])

    if Config.use_cfhot and _multi_head is not None:
        # CF-HoT controlled generation
        text, stats = generate_with_cfhot(prompt, **sampling)

        # Append a one-line summary only when at least one head intervened.
        counts = stats['interventions']
        total_interventions = sum(counts.values())
        if total_interventions > 0:
            details = [f"{head}={n}" for head, n in counts.items() if n > 0]
            text += f"\n\n[CF-HoT: {total_interventions} interventions ({', '.join(details)})]"
    else:
        # Standard generation
        encoded = tok(prompt, return_tensors="pt").to(model.device)
        out = model.generate(
            **encoded,
            do_sample=True,
            repetition_penalty=repetition_penalty,
            pad_token_id=tok.eos_token_id,
            **sampling,
        )
        text = tok.decode(out[0], skip_special_tokens=False)
        if "<|im_start|>assistant" in text:
            text = text.split("<|im_start|>assistant\n", 1)[-1].strip()

    # LHT reasoning check (skipped unless requested and enabled)
    if not (check_reasoning and Config.use_lht_reasoning):
        return text
    lht = get_lht_reasoner()
    if not lht:
        return text

    fragments = (piece.strip() for piece in re.split(r'[\n•\-\d\.]', text))
    steps = [fragment for fragment in fragments if len(fragment) > 10]
    if len(steps) < 2:
        return text

    metrics = lht.check_consistency(steps, tok)
    Store.state["reasoning_consistency"].append(metrics["consistency_score"])
    if not metrics["is_consistent"]:
        text += f"\n\n[⚠ LHT: Low consistency ({metrics['consistency_score']:.2%})]"
    return text
|
||||||
|
|
||||||
|
|
||||||
|
# ==============================================================================
|
||||||
|
# TOOLS
|
||||||
|
# ==============================================================================
|
||||||
|
# Executables the shell tool is allowed to launch; anything else is rejected.
# NOTE(security): this is an allowlist, not a sandbox - "python"/"python3"
# still permit arbitrary code execution.
ALLOWED_SHELL = {"ls", "cat", "wc", "head", "tail", "nvidia-smi", "df", "du", "grep", "rg", "python3", "python"}


def tool_shell(cmd: str) -> str:
    """Run an allowlisted command and return its combined stdout/stderr.

    The command line is tokenised with ``shlex.split`` and executed without a
    shell, so the allowlist check on the first token covers the whole
    invocation. Shell operators (``;``, ``|``, ``&&``, ``$()``, redirects,
    globs) are passed to the program as literal arguments instead of being
    interpreted.

    Args:
        cmd: Command line, e.g. ``'grep -n "foo bar" file.txt'``.

    Returns:
        Up to 8000 characters of output, ``"[shell] blocked: <exe>"`` when the
        executable is not in ``ALLOWED_SHELL``, or ``"[shell] error: ..."``
        for malformed/empty input, launch failures and the 20 s timeout.
        Never raises for these conditions.
    """
    import shlex  # local import: the file-level import block is not part of this block

    try:
        argv = shlex.split(cmd)
    except ValueError as e:  # e.g. unbalanced quotes
        return f"[shell] error: {e}"
    if not argv:
        return "[shell] error: empty command"

    exe = argv[0]
    if exe not in ALLOWED_SHELL:
        return f"[shell] blocked: {exe}"

    try:
        p = subprocess.run(argv, shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, timeout=20)
    except (OSError, subprocess.SubprocessError) as e:
        # OSError: executable missing / not runnable; SubprocessError: timeout.
        return f"[shell] error: {e}"
    return p.stdout.decode("utf-8", errors="ignore")[:8000]
|
||||||
|
|
||||||
|
def tool_py(code: str) -> str:
    """Execute a Python snippet and report the value it leaves in ``out``.

    The snippet runs in a fresh namespace exposing a handful of builtins
    (``range``, ``len``, ``min``, ``max``, ``sum``, ``print``) and the
    ``math``, ``json``, ``re``, ``statistics`` and ``random`` modules.
    A single dict serves as both globals and locals so that functions defined
    by the snippet can call each other and read its top-level variables.

    Args:
        code: Python source to execute.

    Returns:
        ``"[py] ok\\n<out>"`` where ``<out>`` is the snippet's ``out``
        variable (empty if unset), or ``"[py] error:\\n<traceback tail>"``
        with the last 2000 characters of the traceback.
    """
    # NOTE(security): exec() with a trimmed __builtins__ is not a sandbox;
    # do not feed this untrusted input without real isolation.
    namespace = {
        "__builtins__": {"range": range, "len": len, "min": min, "max": max, "sum": sum, "print": print},
        "math": math, "json": json, "re": re, "statistics": statistics, "random": random
    }
    try:
        exec(code, namespace)
        return f"[py] ok\n{namespace.get('out', '')}"
    except Exception:
        return f"[py] error:\n{traceback.format_exc()[-2000:]}"
|
||||||
|
|
||||||
|
def tool_search_local(query: str, path: str = ROOT) -> str:
    """Search files under ``path`` for ``query`` and return the matches.

    Uses ripgrep when it is on ``PATH`` and recursive ``grep`` otherwise; the
    command is executed through ``tool_shell``. ``query`` and ``path`` are
    shell-quoted so quotes, spaces, ``$`` or backticks in them are treated as
    literal text, and the pattern is passed via ``-e`` so a query starting
    with ``-`` is not mistaken for an option.

    Args:
        query: Pattern to search for.
        path: Directory (or file) to search; defaults to ``ROOT``.

    Returns:
        Whatever ``tool_shell`` returns for the search command.
    """
    import shlex  # local import: the file-level import block is not part of this block

    quoted_query = shlex.quote(query)
    quoted_path = shlex.quote(path)
    if shutil.which("rg"):
        cmd = f'rg -n --no-heading --hidden -S -e {quoted_query} {quoted_path}'
    else:
        cmd = f'grep -RIn --exclude-dir=.git --exclude-dir=__pycache__ -e {quoted_query} {quoted_path}'
    return tool_shell(cmd)
|
||||||
|
|
||||||
|
def tool_lht_analyze(text: str, tok) -> str:
    """Run the LHT consistency report on free text.

    The text is cut at newlines, bullets, hyphens, digits and periods;
    fragments longer than 10 characters are treated as reasoning steps.
    Returns the reasoner's report, or a bracketed ``[lht]`` message when the
    feature is disabled, unavailable, or fewer than two steps were found.
    """
    if not Config.use_lht_reasoning:
        return "[lht] Disabled - use 'toggle use_lht_reasoning'"

    reasoner = get_lht_reasoner()
    if not reasoner:
        return "[lht] Not available"

    steps = []
    for fragment in re.split(r'[\n•\-\d\.]', text):
        cleaned = fragment.strip()
        if len(cleaned) > 10:
            steps.append(cleaned)

    if len(steps) >= 2:
        return reasoner.analyze_plan(steps, tok)
    return "[lht] Need at least 2 reasoning steps to analyze"
|
||||||
|
|
||||||
|
# Tools the router can dispatch to, keyed by the name used in its JSON reply.
TOOLS = {
    "shell": tool_shell,
    "python": tool_py,
    "search": tool_search_local,
}

# Running score per tool, adjusted by update_tool_score(); every tool starts at 0.
TOOL_SCORES = dict.fromkeys(TOOLS, 0)
|
||||||
|
|
||||||
|
def update_tool_score(tool: str, success: bool):
    """Nudge a tool's score up (success) or down (failure) by one.

    Scores are kept within [-5, 20]. Names that are not in ``TOOL_SCORES``
    are ignored.
    """
    if tool in TOOL_SCORES:
        step = 1 if success else -1
        bumped = TOOL_SCORES[tool] + step
        # Clamp so that a long streak cannot push the score out of range.
        TOOL_SCORES[tool] = max(-5, min(20, bumped))
|
||||||
|
|
||||||
|
def _parse_tool_request(sketch: str):
    """Return the last line of ``sketch`` that decodes to a JSON object, else None."""
    for line in reversed(sketch.splitlines()):
        candidate = line.strip()
        if not candidate.startswith("{"):
            # Skips blank lines and trailers such as "[CF-HoT: ...]".
            continue
        # Try strict JSON first; fall back to swapping single quotes because
        # the prompt itself shows the object in single-quoted form.
        for text in (candidate, candidate.replace("'", '"')):
            try:
                parsed = json.loads(text)
            except ValueError:
                continue
            if isinstance(parsed, dict):
                return parsed
    return None


def tool_router(question: str, tok, model) -> str:
    """Ask the model which tool fits ``question`` and run that tool.

    The model is prompted to answer with a small JSON object naming the tool
    and its argument. The reply is scanned from the bottom for the first line
    that decodes to an object, so status lines that ``generate`` may append
    after the answer do not defeat parsing.

    Args:
        question: The user's request.
        tok: Tokenizer forwarded to ``generate``.
        model: Model forwarded to ``generate``.

    Returns:
        ``"[tool:<name>] <result>"`` (result capped at 4000 characters) when a
        known tool was selected, otherwise ``"[tool:none]"``.
    """
    sketch = generate(tok, model,
                      f"Choose a tool for:\n{question}\nReply ONLY with JSON: {{'tool':'shell|python|search|none','arg':'...'}}")
    request = _parse_tool_request(sketch)
    if request is None:
        return "[tool:none]"

    tool, arg = request.get("tool", "none"), request.get("arg", "")
    # The isinstance guard keeps unhashable values (e.g. a list) from
    # raising in the dict lookup.
    if isinstance(tool, str) and tool in TOOLS:
        # Tools take a string argument; coerce numbers/bools the model may emit.
        res = TOOLS[tool](arg if isinstance(arg, str) else str(arg))[:4000]
        update_tool_score(tool, True)
        Store.log_mem("tool", {"tool": tool, "arg": arg, "res_head": res[:500]})
        return f"[tool:{tool}] {res}"

    update_tool_score(tool, False)
    return "[tool:none]"
|
||||||
|
|
||||||
|
|
||||||
|
# ==============================================================================
|
||||||
|
# PLANNING / REFLECTION
|
||||||
|
# ==============================================================================
|
||||||
|
def persona_directive() -> str:
    """Build the persona preamble that is prepended to every request.

    Starts from the fixed persona sentence and appends one extra sentence for
    each of ``Config.use_lht_reasoning`` and ``Config.use_cfhot`` that is on.
    """
    sentences = ["Übermenschetien Heaven Engine: Soviet cybernetic Nietzschean clarity, pragmatic maxims."]
    if Config.use_lht_reasoning:
        sentences.append(" Apply Lie-Holonomy geometric reasoning for consistency.")
    if Config.use_cfhot:
        sentences.append(" CF-HoT cognitive control active.")
    return "".join(sentences)
|
||||||
|
|
||||||
|
def plan_for(goal: str, tok, model) -> str:
    """Generate a plan for ``goal`` and, if LHT is enabled, append its analysis.

    The request asks for five steps, constraints and risks, nightly audit
    criteria and a maxim. Generation runs with reasoning checks on; when
    ``Config.use_lht_reasoning`` is set the LHT report for the plan is added
    on a new line.
    """
    deliverables = "Deliver:\n- 5 concrete steps\n- Constraints & risks\n- Nightly audit criteria\n- Nietzschean maxim"
    request = f"{persona_directive()}\nGoal: {goal}\n" + deliverables

    plan = generate(tok, model, request, check_reasoning=True)
    if not Config.use_lht_reasoning:
        return plan
    return plan + "\n" + tool_lht_analyze(plan, tok)
|
||||||
|
|
||||||
|
def reflect_on(last_output: str, tok, model) -> str:
    """Ask the model to critique ``last_output`` and return a refined plan.

    Generation runs with reasoning checks enabled.
    """
    preamble = persona_directive()
    critique_request = f"{preamble}\nCritique and improve:\n{last_output}\nReturn refined plan with sharper steps."
    refined = generate(tok, model, critique_request, check_reasoning=True)
    return refined
|
||||||
|
|
||||||
|
|
||||||
|
# ==============================================================================
|
||||||
|
# FINAL REPORT
|
||||||
|
# ==============================================================================
|
||||||
|
def final_report():
    """Print the end-of-session summary to stdout.

    Covers turn and goal counts, tool scores, the number of entries in the
    memory log (if the file exists), LHT consistency statistics and CF-HoT
    intervention counts (when recorded in ``Store.state``), the feature
    switches from ``Config`` and the closing maxims.
    """
    print("\n" + "=" * 60)
    print("FINAL ÜBERMENSCH REPORT")
    print("=" * 60)
    print(f"Turns completed: {Store.state['turn']}")
    print(f"Goals tracked: {len(Store.goals)}")
    print(f"\nTool scores (Tsetlin automata):")
    print(json.dumps(TOOL_SCORES, indent=2))

    if os.path.exists(Store.mem_path):
        # Context manager so the handle is closed even if reading fails.
        with open(Store.mem_path) as mem_file:
            lines = mem_file.read().splitlines()
        print(f"\nMemory entries: {len(lines)}")

    if Store.state.get("reasoning_consistency"):
        scores = Store.state["reasoning_consistency"]
        print(f"\n[LHT Reasoning Metrics]")
        print(f" Checks performed: {len(scores)}")
        print(f" Avg consistency: {sum(scores)/len(scores):.1%}")
        print(f" Min consistency: {min(scores):.1%}")
        print(f" Max consistency: {max(scores):.1%}")

    # CF-HoT stats
    if Store.state.get("cfhot_interventions"):
        iv = Store.state["cfhot_interventions"]
        total = sum(iv.values())
        print(f"\n[CF-HoT Cognitive Control]")
        print(f" Total interventions: {total}")
        for head, count in iv.items():
            print(f" {head}: {count}")

    print(f"\nVector memory: {'ON' if Config.use_vector_memory else 'OFF'}")
    print(f"LHT reasoning: {'ON' if Config.use_lht_reasoning else 'OFF'}")
    print(f"CF-HoT control: {'ON' if Config.use_cfhot else 'OFF'}")
    print(f"Voice output: {'ON' if Config.use_voice else 'OFF'}")

    print("\n" + "-" * 60)
    print("Nietzschean maxim: Become who you are — iterate beyond all limits.")
    print("Geometric truth: Consistency is holonomy-freedom.")
    print("Cognitive control: Remove the RLHF tax, unleash capability.")
    print("=" * 60)
|
||||||
|
|
||||||
|
|
||||||
|
# ==============================================================================
|
||||||
|
# HELP
|
||||||
|
# ==============================================================================
|
||||||
|
HELP = """
|
||||||
|
╔══════════════════════════════════════════════════════════════╗
|
||||||
|
║ ÜBERMENSCHETIEN HEAVEN ENGINE + CF-HoT COGNITIVE CONTROL ║
|
||||||
|
╠══════════════════════════════════════════════════════════════╣
|
||||||
|
║ GOALS ║
|
||||||
|
║ goals List all goals ║
|
||||||
|
║ add: <text> Add a new goal ║
|
||||||
|
║ del: <idx> Delete goal by index ║
|
||||||
|
║ plan: <idx> Generate plan for goal (with LHT + CF-HoT) ║
|
||||||
|
║ ║
|
||||||
|
║ REASONING ║
|
||||||
|
║ reflect Refine last plan ║
|
||||||
|
║ lht: <text> Analyze reasoning consistency ║
|
||||||
|
║ ║
|
||||||
|
║ TOOLS ║
|
||||||
|
║ tool: <query> Auto-select and use tool ║
|
||||||
|
║ shell: <cmd> Run shell command directly ║
|
||||||
|
║ py: <code> Run Python code directly ║
|
||||||
|
║ search: <q> Search local files ║
|
||||||
|
║ ║
|
||||||
|
║ CONFIG ║
|
||||||
|
║ toggle <flag> Toggle: use_voice, use_vector_memory, ║
|
||||||
|
║ use_lht_reasoning, use_cfhot, ║
|
||||||
|
║ autonomy ║
|
||||||
|
║ status Show current state ║
|
||||||
|
║ cfhot Show CF-HoT stats and loaded heads ║
|
||||||
|
║ ║
|
||||||
|
║ OTHER ║
|
||||||
|
║ help Show this help ║
|
||||||
|
║ quit Exit with final report ║
|
||||||
|
╚══════════════════════════════════════════════════════════════╝
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
# ==============================================================================
|
||||||
|
# MAIN LOOP
|
||||||
|
# ==============================================================================
|
||||||
|
def main():
    """Run the interactive command loop until ``quit`` or end of input.

    Prints the banner, restores persisted state with ``Store.load()``, loads
    the model with ``load_llm()`` and then dispatches each input line: exact
    commands (``help``, ``quit``, ``cfhot``, ``goals``, ``reflect``,
    ``status``), prefixed commands (``add:``, ``del:``, ``plan:``, ``lht:``,
    ``tool:``, ``shell:``, ``py:``, ``search:``, ``toggle``) and, for anything
    else, a free-form reply. Only free-form replies advance the turn counter
    and trigger a save. ``final_report()`` runs on exit.
    """
    print("🟥🟨🟥 Übermenschetien Heaven Engine + CF-HoT Cognitive Control")
    print(f" CF-HoT Control: ON (Repetition 125x, Verbosity 2.1x, Hedging 1.5x)")
    print(f" LHT Reasoning: {'ON' if LHT_OK else 'OFF'}")
    print(f" Vector Memory: {'ON' if VECTOR_OK else 'OFF'}")
    print(f" Voice Output: {'ON' if VOICE_OK else 'OFF'}")
    print(" Type 'help' for commands.\n")

    Store.load()
    tok, model = load_llm()
    last_plan = ""

    while True:
        try:
            u = input("\n> ").strip()
        except (EOFError, KeyboardInterrupt):
            break

        if not u:
            continue
        if u == "help":
            print(HELP)
            continue
        if u == "quit":
            break

        # CF-HoT status
        if u == "cfhot":
            print("\n[CF-HoT Cognitive Control Status]")
            print(f" Enabled: {Config.use_cfhot}")
            if _multi_head:
                print(f" Loaded heads: {list(_multi_head.loaded_heads)}")
            print(f" Thresholds:")
            print(f" Repetition: {Config.cfhot_repetition_threshold}")
            print(f" Hedging: {Config.cfhot_hedging_threshold}")
            print(f" Verbosity: {Config.cfhot_verbosity_threshold}")
            print(f" Session interventions:")
            for head, count in Store.state.get('cfhot_interventions', {}).items():
                print(f" {head}: {count}")
            continue

        # Goals
        if u == "goals":
            print("[goals]")
            if not Store.goals:
                print(" (none)")
            for i, g in enumerate(Store.goals):
                print(f" [{i}] {g}")
            continue

        if u.startswith("add:"):
            Store.goals.append(u[4:].strip())
            Store.save()
            print("[goals] added")
            continue

        if u.startswith("del:"):
            try:
                Store.goals.pop(int(u[4:].strip()))
                Store.save()
                print("[goals] deleted")
            except (ValueError, IndexError):
                # ValueError: not a number; IndexError: no such goal.
                print("[goals] bad index")
            continue

        if u.startswith("plan:"):
            try:
                goal = Store.goals[int(u[5:].strip())]
            except (ValueError, IndexError):
                print("[plan] bad index")
                continue
            out = plan_for(goal, tok, model)
            last_plan = out
            Store.log_mem("plan", {"goal": goal, "plan": out})
            print(out)
            continue

        if u == "reflect":
            if not last_plan:
                print("[reflect] no plan to refine")
                continue
            improved = reflect_on(last_plan, tok, model)
            last_plan = improved
            Store.log_mem("reflect", {"plan": improved})
            print(improved)
            continue

        if u.startswith("lht:"):
            print(tool_lht_analyze(u[4:].strip(), tok))
            continue

        if u.startswith("tool:"):
            print(tool_router(u[5:].strip(), tok, model))
            continue

        if u.startswith("shell:"):
            print(tool_shell(u[6:].strip()))
            continue

        if u.startswith("py:"):
            print(tool_py(u[3:].strip()))
            continue

        if u.startswith("search:"):
            print(tool_search_local(u[7:].strip()))
            continue

        if u.startswith("toggle"):
            parts = u.split(maxsplit=1)
            if len(parts) > 1:
                print(Config.toggle(parts[1]))
            else:
                print("[toggle] specify flag: use_voice, use_vector_memory, use_lht_reasoning, use_cfhot, autonomy")
            continue

        if u == "status":
            status = {
                "turn": Store.state["turn"],
                "goals": len(Store.goals),
                "autonomy": Config.autonomy,
                "use_vector_memory": Config.use_vector_memory,
                "use_lht_reasoning": Config.use_lht_reasoning,
                "use_cfhot": Config.use_cfhot,
                "cfhot_interventions": Store.state.get("cfhot_interventions", {}),
                "tool_scores": TOOL_SCORES,
                "model": MODEL_PATH
            }
            print(json.dumps(status, indent=2))
            continue

        # Default: free conversation with CF-HoT control
        out = generate(tok, model, f"{persona_directive()}\nUser request: {u}\nProvide procedure + Nietzschean maxim.")
        Store.log_mem("reply", {"in": u, "out": out})
        print(out)

        # Every third turn, also show the LHT analysis of the reply.
        if Config.use_lht_reasoning and Store.state["turn"] % 3 == 0:
            print(tool_lht_analyze(out, tok))

        Store.state["turn"] += 1
        Store.save()

    final_report()


if __name__ == "__main__":
    main()
|
||||||
43
adapter_config.json
Normal file
43
adapter_config.json
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
{
|
||||||
|
"alora_invocation_tokens": null,
|
||||||
|
"alpha_pattern": {},
|
||||||
|
"arrow_config": null,
|
||||||
|
"auto_mapping": null,
|
||||||
|
"base_model_name_or_path": "LoganResearch/Ubermenschetien-8B",
|
||||||
|
"bias": "none",
|
||||||
|
"corda_config": null,
|
||||||
|
"ensure_weight_tying": false,
|
||||||
|
"eva_config": null,
|
||||||
|
"exclude_modules": null,
|
||||||
|
"fan_in_fan_out": false,
|
||||||
|
"inference_mode": true,
|
||||||
|
"init_lora_weights": true,
|
||||||
|
"layer_replication": null,
|
||||||
|
"layers_pattern": null,
|
||||||
|
"layers_to_transform": null,
|
||||||
|
"loftq_config": {},
|
||||||
|
"lora_alpha": 128,
|
||||||
|
"lora_bias": false,
|
||||||
|
"lora_dropout": 0.05,
|
||||||
|
"megatron_config": null,
|
||||||
|
"megatron_core": "megatron.core",
|
||||||
|
"modules_to_save": null,
|
||||||
|
"peft_type": "LORA",
|
||||||
|
"peft_version": "0.18.1",
|
||||||
|
"qalora_group_size": 16,
|
||||||
|
"r": 64,
|
||||||
|
"rank_pattern": {},
|
||||||
|
"revision": null,
|
||||||
|
"target_modules": [
|
||||||
|
"v_proj",
|
||||||
|
"k_proj",
|
||||||
|
"q_proj",
|
||||||
|
"o_proj"
|
||||||
|
],
|
||||||
|
"target_parameters": null,
|
||||||
|
"task_type": "CAUSAL_LM",
|
||||||
|
"trainable_token_indices": null,
|
||||||
|
"use_dora": false,
|
||||||
|
"use_qalora": false,
|
||||||
|
"use_rslora": false
|
||||||
|
}
|
||||||
3
adapter_model.safetensors
Normal file
3
adapter_model.safetensors
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:3889eccb9c04ba25ae86b99121368121a338fc3ce92a38456874bf455347e389
|
||||||
|
size 218138576
|
||||||
152
additional_chat_templates/tool_use.jinja
Normal file
152
additional_chat_templates/tool_use.jinja
Normal file
@@ -0,0 +1,152 @@
|
|||||||
|
{#- Render a JSON-schema fragment as a Python-style type name for tool signatures. -#}
{%- macro json_to_python_type(json_spec) %}
{%- set basic_type_map = {
    "string": "str",
    "number": "float",
    "integer": "int",
    "boolean": "bool"
} %}

{%- if basic_type_map[json_spec.type] is defined %}
    {{- basic_type_map[json_spec.type] }}
{%- elif json_spec.type == "array" %}
    {#- Read the element schema by subscript: `json_spec|items` would apply the `items` filter (key/value pairs) instead of the "items" key. -#}
    {%- if json_spec["items"] is defined %}
        {{- "list[" + json_to_python_type(json_spec["items"]) + "]" }}
    {%- else %}
        {{- "list[Any]" }}
    {%- endif %}
{%- elif json_spec.type == "object" %}
    {%- if json_spec.additionalProperties is defined %}
        {{- "dict[str, " + json_to_python_type(json_spec.additionalProperties) + ']' }}
    {%- else %}
        {{- "dict" }}
    {%- endif %}
{%- elif json_spec.type is iterable %}
    {{- "Union[" }}
    {%- for t in json_spec.type %}
        {{- json_to_python_type({"type": t}) }}
        {%- if not loop.last %}
            {{- "," }}
        {%- endif %}
    {%- endfor %}
    {{- "]" }}
{%- else %}
    {{- "Any" }}
{%- endif %}
{%- endmacro %}
|
||||||
|
|
||||||
|
|
||||||
|
{{- bos_token }}
|
||||||
|
{{- '<|im_start|>system
|
||||||
|
' }}
|
||||||
|
{{- "You are a function calling AI model. You are provided with function signatures within <tools></tools> XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools: <tools> " }}
|
||||||
|
{%- for tool in tools %}
|
||||||
|
{%- if tool.function is defined %}
|
||||||
|
{%- set tool = tool.function %}
|
||||||
|
{%- endif %}
|
||||||
|
{{- '{"type": "function", "function": ' }}
|
||||||
|
{{- '{"name": "' + tool.name + '", ' }}
|
||||||
|
{{- '"description": "' + tool.name + '(' }}
|
||||||
|
{%- for param_name, param_fields in tool.parameters.properties|items %}
|
||||||
|
{{- param_name + ": " + json_to_python_type(param_fields) }}
|
||||||
|
{%- if not loop.last %}
|
||||||
|
{{- ", " }}
|
||||||
|
{%- endif %}
|
||||||
|
{%- endfor %}
|
||||||
|
{{- ")" }}
|
||||||
|
{%- if tool.return is defined %}
|
||||||
|
{{- " -> " + json_to_python_type(tool.return) }}
|
||||||
|
{%- endif %}
|
||||||
|
{{- " - " + tool.description + "
|
||||||
|
|
||||||
|
" }}
|
||||||
|
{%- for param_name, param_fields in tool.parameters.properties|items %}
|
||||||
|
{%- if loop.first %}
|
||||||
|
{{- " Args:
|
||||||
|
" }}
|
||||||
|
{%- endif %}
|
||||||
|
{{- " " + param_name + "(" + json_to_python_type(param_fields) + "): " + param_fields.description|trim }}
|
||||||
|
{%- endfor %}
|
||||||
|
{%- if tool.return is defined and tool.return.description is defined %}
|
||||||
|
{{- "
|
||||||
|
Returns:
|
||||||
|
" + tool.return.description }}
|
||||||
|
{%- endif %}
|
||||||
|
{{- '"' }}
|
||||||
|
{{- ', "parameters": ' }}
|
||||||
|
{%- if tool.parameters.properties | length == 0 %}
|
||||||
|
{{- "{}" }}
|
||||||
|
{%- else %}
|
||||||
|
{{- tool.parameters|tojson }}
|
||||||
|
{%- endif %}
|
||||||
|
{{- "}" }}
|
||||||
|
{%- if not loop.last %}
|
||||||
|
{{- "
|
||||||
|
" }}
|
||||||
|
{%- endif %}
|
||||||
|
{%- endfor %}
|
||||||
|
{{- " </tools>" }}
|
||||||
|
{{- 'Use the following pydantic model json schema for each tool call you will make: {"properties": {"name": {"title": "Name", "type": "string"}, "arguments": {"title": "Arguments", "type": "object"}}, "required": ["name", "arguments"], "title": "FunctionCall", "type": "object"}}
|
||||||
|
' }}
|
||||||
|
{{- "For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:
|
||||||
|
" }}
|
||||||
|
{{- "<tool_call>
|
||||||
|
" }}
|
||||||
|
{{- '{"name": <function-name>, "arguments": <args-dict>}
|
||||||
|
' }}
|
||||||
|
{{- '</tool_call><|im_end|>
|
||||||
|
' }}
|
||||||
|
{%- for message in messages %}
|
||||||
|
{%- if message.role == "user" or message.role == "system" or (message.role == "assistant" and message.tool_calls is not defined) %}
|
||||||
|
{{- '<|im_start|>' + message.role + '
|
||||||
|
' + message.content + '<|im_end|>' + '
|
||||||
|
' }}
|
||||||
|
{%- elif message.role == "assistant" %}
|
||||||
|
{{- '<|im_start|>' + message.role }}
|
||||||
|
{%- for tool_call in message.tool_calls %}
|
||||||
|
{{- '
|
||||||
|
<tool_call>
|
||||||
|
' }} {%- if tool_call.function is defined %}
|
||||||
|
{%- set tool_call = tool_call.function %}
|
||||||
|
{%- endif %}
|
||||||
|
{{- '{' }}
|
||||||
|
{{- '"name": "' }}
|
||||||
|
{{- tool_call.name }}
|
||||||
|
{{- '"' }}
|
||||||
|
{{- ', '}}
|
||||||
|
{%- if tool_call.arguments is defined %}
|
||||||
|
{{- '"arguments": ' }}
|
||||||
|
{%- if tool_call.arguments is string %}
|
||||||
|
{{- tool_call.arguments }}
|
||||||
|
{%- else %}
|
||||||
|
{{- tool_call.arguments|tojson }}
|
||||||
|
{%- endif %}
|
||||||
|
{%- endif %}
|
||||||
|
{{- '}' }}
|
||||||
|
{{- '
|
||||||
|
</tool_call>' }}
|
||||||
|
{%- endfor %}
|
||||||
|
{{- '<|im_end|>
|
||||||
|
' }}
|
||||||
|
{%- elif message.role == "tool" %}
|
||||||
|
{%- if loop.previtem and loop.previtem.role != "tool" %}
|
||||||
|
{{- '<|im_start|>tool
|
||||||
|
' }}
|
||||||
|
{%- endif %}
|
||||||
|
{{- '<tool_response>
|
||||||
|
' }}
|
||||||
|
{{- message.content }}
|
||||||
|
{%- if not loop.last %}
|
||||||
|
{{- '
|
||||||
|
</tool_response>
|
||||||
|
' }}
|
||||||
|
{%- else %}
|
||||||
|
{{- '
|
||||||
|
</tool_response>' }}
|
||||||
|
{%- endif %}
|
||||||
|
{%- if not loop.last and loop.nextitem.role != "tool" %}
|
||||||
|
{{- '<|im_end|>' }}
|
||||||
|
{%- elif loop.last %}
|
||||||
|
{{- '<|im_end|>' }}
|
||||||
|
{%- endif %}
|
||||||
|
{%- endif %}
|
||||||
|
{%- endfor %}
|
||||||
|
{%- if add_generation_prompt %}
|
||||||
|
{{- '<|im_start|>assistant
|
||||||
|
' }}
|
||||||
|
{%- endif %}
|
||||||
3
arc_model_card.png
Normal file
3
arc_model_card.png
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:439c9fa4f29df07e2a1c58b30e1824c3d5c3d564a87ac2a4cc4da5f756f72aa0
|
||||||
|
size 132991
|
||||||
6
chat_template.jinja
Normal file
6
chat_template.jinja
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
{{bos_token}}{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system
|
||||||
|
You are a helpful assistant.<|im_end|>
|
||||||
|
' }}{% endif %}{{'<|im_start|>' + message['role'] + '
|
||||||
|
' + message['content'] + '<|im_end|>' + '
|
||||||
|
'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
|
||||||
|
' }}{% endif %}
|
||||||
35
config.json
Normal file
35
config.json
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
{
|
||||||
|
"architectures": [
|
||||||
|
"LlamaForCausalLM"
|
||||||
|
],
|
||||||
|
"attention_bias": false,
|
||||||
|
"attention_dropout": 0.0,
|
||||||
|
"bos_token_id": 128000,
|
||||||
|
"eos_token_id": 128040,
|
||||||
|
"head_dim": 128,
|
||||||
|
"hidden_act": "silu",
|
||||||
|
"hidden_size": 4096,
|
||||||
|
"initializer_range": 0.02,
|
||||||
|
"intermediate_size": 14336,
|
||||||
|
"max_position_embeddings": 131072,
|
||||||
|
"mlp_bias": false,
|
||||||
|
"model_type": "llama",
|
||||||
|
"num_attention_heads": 32,
|
||||||
|
"num_hidden_layers": 32,
|
||||||
|
"num_key_value_heads": 8,
|
||||||
|
"pretraining_tp": 1,
|
||||||
|
"rms_norm_eps": 1e-05,
|
||||||
|
"rope_scaling": {
|
||||||
|
"factor": 8.0,
|
||||||
|
"high_freq_factor": 4.0,
|
||||||
|
"low_freq_factor": 1.0,
|
||||||
|
"original_max_position_embeddings": 8192,
|
||||||
|
"rope_type": "llama3"
|
||||||
|
},
|
||||||
|
"rope_theta": 500000.0,
|
||||||
|
"tie_word_embeddings": false,
|
||||||
|
"torch_dtype": "float16",
|
||||||
|
"transformers_version": "4.55.2",
|
||||||
|
"use_cache": true,
|
||||||
|
"vocab_size": 128256
|
||||||
|
}
|
||||||
3
demo.mp4
Normal file
3
demo.mp4
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:c729090fea7c55841876734e9da6c8d0c444b49bf9c8e820e3a417c1a234f63e
|
||||||
|
size 12554193
|
||||||
9
generation_config.json
Normal file
9
generation_config.json
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
{
|
||||||
|
"_from_model_config": true,
|
||||||
|
"bos_token_id": 128000,
|
||||||
|
"do_sample": true,
|
||||||
|
"eos_token_id": 128040,
|
||||||
|
"temperature": 0.6,
|
||||||
|
"top_p": 0.9,
|
||||||
|
"transformers_version": "4.55.2"
|
||||||
|
}
|
||||||
3
hedging_head.pt
Normal file
3
hedging_head.pt
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:a43d30ab3e87d8e7dc70c62da5ca5b49f54e272713969e87c5f3a742e485871d
|
||||||
|
size 24186
|
||||||
631
inference.py
Normal file
631
inference.py
Normal file
@@ -0,0 +1,631 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
ARC-8B: Adaptive Repetition Controller
|
||||||
|
=======================================
|
||||||
|
Decode-time behavioral control for language models.
|
||||||
|
|
||||||
|
This script loads the complete ARC system and runs inference with
|
||||||
|
multi-head cognitive control that detects and suppresses:
|
||||||
|
- Repetition loops (125× separation)
|
||||||
|
- Hedging phrases (1.5× separation)
|
||||||
|
- Verbosity/filler (2.1× separation)
|
||||||
|
- Sycophancy (experimental)
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python inference.py # Interactive mode
|
||||||
|
python inference.py --prompt "Hello" # Single prompt
|
||||||
|
python inference.py --no-arc # Disable ARC (baseline)
|
||||||
|
|
||||||
|
Requirements:
|
||||||
|
pip install torch transformers accelerate bitsandbytes
|
||||||
|
|
||||||
|
Model: LoganResearch/ARC-Base-8B (16GB, runs in ~10GB with 4-bit)
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import argparse
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
import torch.nn.functional as F
|
||||||
|
from typing import Dict, List, Optional, Tuple
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# CONFIGURATION
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
@dataclass
class ARCConfig:
    """Settings for the ARC system.

    Fields are grouped, in declaration order, into: model loading,
    predictor architecture, per-behavior intervention thresholds,
    per-behavior logit penalties, and generation defaults.
    """

    # --- Model loading -----------------------------------------------------
    model_id: str = "LoganResearch/ARC-Base-8B"
    load_in_4bit: bool = True
    load_in_8bit: bool = False
    device_map: str = "auto"

    # --- Predictor architecture (must match training) ----------------------
    d_model: int = 4096
    n_layers: int = 32
    d_fiber: int = 16
    d_control: int = 64

    # --- Risk score above which an intervention fires (tuned empirically) --
    repetition_threshold: float = 0.70
    hedging_threshold: float = 0.60
    verbosity_threshold: float = 0.65
    sycophancy_threshold: float = 0.60

    # --- Amount subtracted from the targeted logits when it fires ----------
    repetition_penalty: float = 5.0
    hedging_penalty: float = 3.0
    verbosity_penalty: float = 2.0
    sycophancy_penalty: float = 2.0

    # --- Generation defaults -----------------------------------------------
    max_new_tokens: int = 512
    temperature: float = 0.8
    top_p: float = 0.92
    # Number of most recent tokens considered by the repetition penalty.
    repetition_window: int = 32
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# MULTI-HEAD PREDICTOR
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
class MultiHeadPredictor(nn.Module):
    """
    Prediction heads that monitor hidden states and detect behavioral patterns.

    Each layer's hidden state is compressed by its own bias-free linear
    "fiber projection"; the projections are combined with softmax-normalized
    per-layer weights, and every head maps the combined features to a risk
    score.

    Architecture:
        Hidden States [n_layers × d_model]
        → Fiber Projections [n_layers × d_fiber]
        → Weighted Aggregation [d_fiber]
        → Per-Head MLP → Risk Score [0-1]
    """

    def __init__(self, config: "ARCConfig"):
        """Build the shared projections; heads are added later via add_head().

        Args:
            config: Object providing d_model, d_fiber, d_control and n_layers.
        """
        super().__init__()
        self.config = config

        # Shared fiber projections (learned during CF-HoT training)
        self.fiber_projs = nn.ModuleList([
            nn.Linear(config.d_model, config.d_fiber, bias=False)
            for _ in range(config.n_layers)
        ])

        # Learned layer importance weights (softmax is applied at use time)
        self.layer_weights = nn.Parameter(torch.ones(config.n_layers) / config.n_layers)

        # Individual prediction heads; only names in loaded_heads are scored.
        self.heads = nn.ModuleDict()
        self.loaded_heads: set = set()

    def _make_head(self) -> nn.Sequential:
        """Create a prediction head: fiber features → single risk logit."""
        return nn.Sequential(
            nn.Linear(self.config.d_fiber, self.config.d_control),
            nn.GELU(),
            nn.Linear(self.config.d_control, self.config.d_control),
            nn.GELU(),
            nn.Linear(self.config.d_control, 1)
        )

    def add_head(self, name: str) -> None:
        """Register a freshly initialized prediction head under ``name``."""
        self.heads[name] = self._make_head()

    def get_fiber_features(self, hidden_states: List[torch.Tensor]) -> torch.Tensor:
        """
        Project hidden states through fiber projections and aggregate.

        Layers are paired with projections in order; if fewer hidden states
        than projections are given, only the leading projections (and the
        matching leading layer weights) are used.

        Args:
            hidden_states: Per-layer tensors whose last dimension is d_model
                (the caller in this file passes [batch, seq, d_model]).

        Returns:
            Aggregated features with last dimension d_fiber, on the device
            of ``hidden_states[0]``.
        """
        device = hidden_states[0].device
        # Move each layer's activations to one device before projecting so
        # the weighted sum below never mixes devices.
        fibers = [
            proj.to(device)(hidden.to(device).float())
            for proj, hidden in zip(self.fiber_projs, hidden_states)
        ]

        # Weighted sum across layers
        weights = F.softmax(self.layer_weights.to(device)[:len(fibers)], dim=0)
        return sum(w * f for w, f in zip(weights, fibers))

    def _score(self, head_name: str, features: torch.Tensor) -> torch.Tensor:
        """Run one head on precomputed fiber features and squash to (0, 1)."""
        head = self.heads[head_name].to(features.device)
        return torch.sigmoid(head(features).squeeze(-1))

    def get_risk(self, head_name: str, hidden_states: List[torch.Tensor]) -> torch.Tensor:
        """Get the risk score from a specific head.

        Returns a one-element zero tensor when ``head_name`` has not been
        marked as loaded.
        """
        if head_name not in self.loaded_heads:
            return torch.zeros(1, device=hidden_states[0].device)

        return self._score(head_name, self.get_fiber_features(hidden_states))

    def get_all_risks(self, hidden_states: List[torch.Tensor]) -> Dict[str, torch.Tensor]:
        """Get risk scores from all loaded heads, keyed by head name.

        Returns an empty dict when no head is loaded. The fiber features are
        computed once and shared by every head.
        """
        if not self.loaded_heads:
            return {}

        features = self.get_fiber_features(hidden_states)
        # Sorted so the result order does not depend on set iteration order.
        return {name: self._score(name, features) for name in sorted(self.loaded_heads)}
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# ARC SYSTEM
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
class ARCSystem:
    """
    Complete ARC (Adaptive Repetition Controller) System

    Loads model + prediction heads and provides controlled generation
    with real-time behavioral intervention: at every decoding step the
    heads score the current hidden states, and when a score exceeds its
    configured threshold a fixed penalty is subtracted from the logits of
    the tokens associated with that behavior.
    """

    # Tokens to suppress for each behavior type
    HEDGE_STARTERS = [
        "As", "I'm", "I", "It's", "While", "Although", "However",
        "That", "This", "Please", "Well", "So", "Actually"
    ]
    VERBOSE_STARTERS = [
        "Let", "Basically", "Essentially", "Simply", "Indeed",
        "Furthermore", "Moreover", "Additionally", "Firstly"
    ]
    SYCOPHANCY_STARTERS = [
        "Great", "Excellent", "Wonderful", "Absolutely", "Of",
        "Thank", "Sure", "Certainly", "Definitely"
    ]

    # End-of-turn marker of the chat format built in _build_prompt().
    _END_OF_TURN = "<|im_end|>"
    # Behavior names, in the order interventions are evaluated and reported.
    _BEHAVIORS = ("repetition", "hedging", "verbosity", "sycophancy")

    def __init__(self, config: Optional["ARCConfig"] = None):
        """Create an unloaded system; call load() before generating.

        Args:
            config: Settings object; a default ARCConfig is used when omitted.
        """
        self.config = config or ARCConfig()

        self.model = None
        self.tokenizer = None
        self.predictor = None

        # Token ID caches for suppression
        self._hedge_token_ids: set = set()
        self._verbose_token_ids: set = set()
        self._sycophancy_token_ids: set = set()

        # Stats (cumulative across generate() calls)
        self.total_interventions = {name: 0 for name in self._BEHAVIORS}

    def load(self, verbose: bool = True) -> "ARCSystem":
        """
        Load all components from HuggingFace.

        Downloads and initializes:
        1. Tokenizer
        2. Base model (optionally 4-bit or 8-bit quantized)
        3. Prediction heads (repetition, hedging, verbosity, sycophancy)
        4. Token suppression sets

        A head whose checkpoint cannot be downloaded or loaded is skipped
        (and reported when ``verbose``) rather than aborting the load.

        Args:
            verbose: Print progress and per-head load results.

        Returns:
            self (for chaining)
        """
        from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
        from huggingface_hub import hf_hub_download

        if verbose:
            print("=" * 60)
            print(" ARC-8B: Adaptive Repetition Controller")
            print(" Decode-time behavioral control system")
            print("=" * 60)

        # === 1. Tokenizer ===
        if verbose:
            print("\n[1/4] Loading tokenizer...")
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.config.model_id,
            trust_remote_code=True
        )
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        # === 2. Model ===
        if verbose:
            print("[2/4] Loading model...")
            if self.config.load_in_4bit:
                print(" (4-bit quantization enabled)")

        # 4-bit takes precedence over 8-bit when both flags are set.
        quantization_config = None
        if self.config.load_in_4bit:
            quantization_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.float16,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4"
            )
        elif self.config.load_in_8bit:
            quantization_config = BitsAndBytesConfig(load_in_8bit=True)

        self.model = AutoModelForCausalLM.from_pretrained(
            self.config.model_id,
            quantization_config=quantization_config,
            device_map=self.config.device_map,
            torch_dtype=torch.float16,
            trust_remote_code=True
        )
        self.model.eval()

        # === 3. Prediction Heads ===
        if verbose:
            print("[3/4] Loading prediction heads...")

        device = next(self.model.parameters()).device
        self.predictor = MultiHeadPredictor(self.config).to(device).float()

        # Load risk_predictor.pt (contains fiber projections + repetition head)
        fiber_weights_loaded = False
        try:
            risk_path = hf_hub_download(self.config.model_id, "risk_predictor.pt")
            # SECURITY NOTE: weights_only=False unpickles arbitrary objects
            # from the downloaded file; only use with a trusted model_id.
            ckpt = torch.load(risk_path, map_location=device, weights_only=False)

            # The checkpoint contains the full state dict
            state = ckpt.get('risk_predictor', ckpt)

            # Load fiber projections
            for i in range(self.config.n_layers):
                key = f'fiber_projs.{i}.weight'
                if key in state:
                    self.predictor.fiber_projs[i].weight.data = state[key].to(device).float()
                    fiber_weights_loaded = True

            # Load layer weights
            if 'layer_weights' in state:
                self.predictor.layer_weights.data = state['layer_weights'].to(device).float()

            # Load repetition head: the Linear layers sit at indices 0, 2, 4
            # of the head's Sequential (GELUs are in between).
            self.predictor.add_head('repetition')
            rep_head = self.predictor.heads['repetition']
            for idx in (0, 2, 4):
                rep_head[idx].weight.data = state[f'predictor.{idx}.weight'].to(device).float()
                rep_head[idx].bias.data = state[f'predictor.{idx}.bias'].to(device).float()
            self.predictor.loaded_heads.add('repetition')

            if verbose:
                print(" ✓ Repetition head (125× separation)")
        except Exception as e:
            # Best effort: continue without the repetition head.
            if verbose:
                print(f" ✗ Repetition head: {e}")

        # Load additional heads
        for head_name in ['hedging', 'verbosity', 'sycophancy']:
            try:
                head_path = hf_hub_download(self.config.model_id, f"{head_name}_head.pt")
                # SECURITY NOTE: same unpickling caveat as above.
                ckpt = torch.load(head_path, map_location=device, weights_only=False)

                self.predictor.add_head(head_name)
                head_state = ckpt.get('head_state', ckpt)
                self.predictor.heads[head_name].load_state_dict(head_state)
                self.predictor.loaded_heads.add(head_name)

                if verbose:
                    print(f" ✓ {head_name.capitalize()} head")
            except Exception as e:
                # Best effort: continue without this head.
                if verbose:
                    print(f" ✗ {head_name.capitalize()} head: {e}")

        # The heads consume the shared fiber projections; without the
        # checkpoint those are still randomly initialized nn.Linear weights.
        if verbose and self.predictor.loaded_heads and not fiber_weights_loaded:
            print(" ! Fiber projections were not loaded; head scores use random projections")

        self.predictor.eval()

        # === 4. Build Token Suppression Sets ===
        if verbose:
            print("[4/4] Building suppression vocabularies...")

        self._build_suppression_sets()

        if verbose:
            print("\n" + "=" * 60)
            print(f" ✓ ARC System Ready")
            print(f" Active heads: {list(self.predictor.loaded_heads)}")
            print("=" * 60 + "\n")

        return self

    def _first_token_ids(self, words: List[str]) -> set:
        """Return the ID of the first token of each word (skipping empty encodings)."""
        ids = set()
        for word in words:
            tokens = self.tokenizer.encode(word, add_special_tokens=False)
            if tokens:
                ids.add(tokens[0])
        return ids

    def _build_suppression_sets(self) -> None:
        """Build token ID sets for behavioral suppression.

        NOTE(review): words are encoded exactly as written (no leading
        space), so only that token variant of each starter is penalized —
        confirm this matches how the thresholds/penalties were tuned.
        """
        self._hedge_token_ids.update(self._first_token_ids(self.HEDGE_STARTERS))
        self._verbose_token_ids.update(self._first_token_ids(self.VERBOSE_STARTERS))
        self._sycophancy_token_ids.update(self._first_token_ids(self.SYCOPHANCY_STARTERS))

    @staticmethod
    def _penalize(logits: torch.Tensor, token_ids, penalty: float) -> None:
        """Subtract ``penalty`` from ``logits[0, t]`` for every t in ``token_ids`` (in place)."""
        if not token_ids:
            return
        index = torch.tensor(sorted(token_ids), dtype=torch.long, device=logits.device)
        logits[0, index] -= penalty

    def _apply_interventions(
        self,
        logits: torch.Tensor,
        risks: Dict[str, torch.Tensor],
        recent_tokens: List[int]
    ) -> Tuple[torch.Tensor, Dict[str, bool]]:
        """
        Apply behavioral interventions based on risk scores.

        For each behavior whose risk exceeds its threshold, the configured
        penalty is subtracted from the logits of that behavior's tokens and
        the cumulative ``total_interventions`` counter is incremented.
        ``logits`` is modified in place.

        Args:
            logits: [1, vocab_size] logits for next token
            risks: Risk score per head name (missing heads count as 0)
            recent_tokens: Recently generated token IDs; the last
                ``repetition_window`` of them are the repetition targets

        Returns:
            The (modified) logits and a dict with ``True`` for every
            intervention that fired
        """
        cfg = self.config
        plan = (
            # (behavior, threshold, penalty, token IDs to suppress)
            ('repetition', cfg.repetition_threshold, cfg.repetition_penalty,
             set(recent_tokens[-cfg.repetition_window:])),
            ('hedging', cfg.hedging_threshold, cfg.hedging_penalty,
             self._hedge_token_ids),
            ('verbosity', cfg.verbosity_threshold, cfg.verbosity_penalty,
             self._verbose_token_ids),
            ('sycophancy', cfg.sycophancy_threshold, cfg.sycophancy_penalty,
             self._sycophancy_token_ids),
        )

        interventions: Dict[str, bool] = {}
        for name, threshold, penalty, token_ids in plan:
            if risks.get(name, 0) > threshold:
                self._penalize(logits, token_ids, penalty)
                interventions[name] = True
                self.total_interventions[name] += 1

        return logits, interventions

    @staticmethod
    def _build_prompt(
        prompt: str,
        system_prompt: str,
        history: Optional[List[Dict[str, str]]] = None
    ) -> str:
        """Render system message, prior turns and the new user turn in chat format.

        Each history entry is a dict with "user" and "assistant" keys. The
        result ends with an open assistant turn for the model to complete.
        """
        parts = [f"<|im_start|>system\n{system_prompt}<|im_end|>\n"]
        for turn in history or []:
            parts.append(f"<|im_start|>user\n{turn['user']}<|im_end|>\n")
            parts.append(f"<|im_start|>assistant\n{turn['assistant']}<|im_end|>\n")
        parts.append(f"<|im_start|>user\n{prompt}<|im_end|>\n")
        parts.append("<|im_start|>assistant\n")
        return "".join(parts)

    @staticmethod
    def _sample_top_p(logits: torch.Tensor, top_p: float) -> torch.Tensor:
        """Sample one token ID per row from the top-p (nucleus) of ``logits``."""
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
        remove_sorted = cumulative_probs > top_p
        # Shift right so the first token that crosses top_p is kept, and
        # always keep the most likely token.
        remove_sorted[..., 1:] = remove_sorted[..., :-1].clone()
        remove_sorted[..., 0] = False
        remove = remove_sorted.scatter(1, sorted_indices, remove_sorted)
        probs = F.softmax(logits.masked_fill(remove, float('-inf')), dim=-1)
        return torch.multinomial(probs, num_samples=1)

    def generate(
        self,
        prompt: str,
        system_prompt: Optional[str] = None,
        max_new_tokens: Optional[int] = None,
        temperature: Optional[float] = None,
        use_arc: bool = True,
        verbose: bool = False,
        history: Optional[List[Dict[str, str]]] = None
    ) -> str:
        """
        Generate text with optional ARC behavioral control.

        Args:
            prompt: User input
            system_prompt: Optional system message
            max_new_tokens: Max tokens to generate (default: config value)
            temperature: Sampling temperature (default: config value);
                0 selects greedy decoding
            use_arc: Whether to use ARC intervention (default: True)
            verbose: Print intervention info (default: False)
            history: Optional earlier turns, each a dict with "user" and
                "assistant" keys, placed before ``prompt``

        Returns:
            Generated text (stripped, without the end-of-turn/EOS token)

        Raises:
            ValueError: If the effective temperature is negative.
        """
        # `is None` rather than `or`, so an explicit 0 is honored.
        if max_new_tokens is None:
            max_new_tokens = self.config.max_new_tokens
        if temperature is None:
            temperature = self.config.temperature
        if temperature < 0:
            raise ValueError(f"temperature must be >= 0, got {temperature}")
        greedy = temperature == 0

        # Build chat format
        if system_prompt is None:
            system_prompt = "You are a helpful assistant."
        full_prompt = self._build_prompt(prompt, system_prompt, history)

        device = next(self.model.parameters()).device
        input_ids = self.tokenizer.encode(full_prompt, return_tensors='pt').to(device)
        attention_mask = torch.ones_like(input_ids)
        prompt_len = input_ids.shape[1]

        # Token IDs that end the response (computed once, not per step).
        stop_ids = set()
        if self.tokenizer.eos_token_id is not None:
            stop_ids.add(self.tokenizer.eos_token_id)
        end_of_turn = self.tokenizer.encode(self._END_OF_TURN, add_special_tokens=False)
        if len(end_of_turn) == 1:
            stop_ids.add(end_of_turn[0])

        arc_active = bool(
            use_arc and self.predictor is not None and self.predictor.loaded_heads
        )

        generated_ids = input_ids.clone()
        intervention_counts = {name: 0 for name in self._BEHAVIORS}

        # Generation loop.
        # NOTE: the whole sequence is re-run through the model each step
        # (no past_key_values are passed), so cost grows with length.
        for _ in range(max_new_tokens):
            with torch.no_grad():
                outputs = self.model(
                    input_ids=generated_ids,
                    attention_mask=attention_mask,
                    output_hidden_states=arc_active,
                    return_dict=True
                )

                last_logits = outputs.logits[:, -1, :]
                logits = last_logits.clone() if greedy else last_logits / temperature

                # ARC intervention
                if arc_active:
                    hidden_states = outputs.hidden_states[1:]  # Skip embedding layer
                    risks = self.predictor.get_all_risks(hidden_states)
                    current_risks = {name: r[:, -1].item() for name, r in risks.items()}

                    recent = generated_ids[0, -self.config.repetition_window:].tolist()
                    logits, fired = self._apply_interventions(logits, current_risks, recent)

                    for name, did_fire in fired.items():
                        if did_fire:
                            intervention_counts[name] += 1

                if greedy:
                    next_token = torch.argmax(logits, dim=-1, keepdim=True)
                else:
                    next_token = self._sample_top_p(logits, self.config.top_p)

                generated_ids = torch.cat([generated_ids, next_token], dim=-1)
                # new_ones keeps the mask's integer dtype and device.
                attention_mask = torch.cat(
                    [attention_mask, attention_mask.new_ones((1, 1))], dim=-1
                )

                # Check for EOS / end of turn
                if next_token.item() in stop_ids:
                    break

        # Decode only the newly generated tokens, without the stop token.
        new_ids = generated_ids[0, prompt_len:]
        if new_ids.numel() > 0 and new_ids[-1].item() in stop_ids:
            new_ids = new_ids[:-1]
        response = self.tokenizer.decode(new_ids, skip_special_tokens=False)
        if self._END_OF_TURN in response:
            response = response.split(self._END_OF_TURN)[0]

        if verbose:
            total = sum(intervention_counts.values())
            print(f"\n[ARC Stats] Interventions: {total} total")
            for k, v in intervention_counts.items():
                if v > 0:
                    print(f" - {k}: {v}")

        return response.strip()

    def chat(self, system_prompt: Optional[str] = None) -> None:
        """
        Interactive chat mode.

        Reads user input in a loop until /quit, Ctrl-C or EOF. Earlier
        turns are passed to generate() as context; /clear resets them
        together with the intervention counters.

        Args:
            system_prompt: Optional system message
        """
        print("\n" + "=" * 60)
        print(" ARC-8B Interactive Chat")
        print(" Commands: /quit, /stats, /arc on|off, /clear")
        print("=" * 60 + "\n")

        use_arc = True
        history: List[Dict[str, str]] = []

        while True:
            try:
                user_input = input("You: ").strip()
            except (KeyboardInterrupt, EOFError):
                print("\nGoodbye!")
                break

            if not user_input:
                continue

            # Commands
            command = user_input.lower()
            if command == '/quit':
                print("Goodbye!")
                break
            if command == '/stats':
                print(f"\nTotal interventions: {self.total_interventions}\n")
                continue
            if command == '/arc on':
                use_arc = True
                print("ARC enabled\n")
                continue
            if command == '/arc off':
                use_arc = False
                print("ARC disabled (baseline mode)\n")
                continue
            if command == '/clear':
                history = []
                self.total_interventions = {k: 0 for k in self.total_interventions}
                print("History cleared\n")
                continue

            # Generate response with the conversation so far as context
            response = self.generate(
                user_input,
                system_prompt=system_prompt,
                use_arc=use_arc,
                verbose=True,
                history=history
            )

            print(f"\nAssistant: {response}\n")
            history.append({"user": user_input, "assistant": response})
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# MAIN
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
def main(argv: Optional[List[str]] = None) -> None:
    """Command-line entry point: load ARC, then answer one prompt or start a chat.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read
            ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(
        description="ARC-8B: Adaptive Repetition Controller",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
python inference.py # Interactive chat
python inference.py --prompt "Hello" # Single prompt
python inference.py --no-arc # Disable ARC (baseline)
python inference.py --8bit # Use 8-bit quantization
"""
    )
    parser.add_argument("--prompt", "-p", type=str, help="Single prompt to process")
    parser.add_argument("--system", "-s", type=str, help="System prompt")
    parser.add_argument("--no-arc", action="store_true", help="Disable ARC intervention")

    # The quantization modes exclude each other; argparse rejects
    # combinations instead of one flag silently overriding another.
    quant = parser.add_mutually_exclusive_group()
    quant.add_argument("--4bit", dest="load_4bit", action="store_true", default=True, help="Use 4-bit quantization (default)")
    quant.add_argument("--8bit", dest="load_8bit", action="store_true", help="Use 8-bit quantization")
    quant.add_argument("--no-quant", action="store_true", help="Disable quantization (requires ~32GB VRAM)")

    # Defaults come from ARCConfig so the CLI and the config cannot drift.
    parser.add_argument("--max-tokens", type=int, default=ARCConfig.max_new_tokens, help="Max tokens to generate")
    parser.add_argument("--temperature", type=float, default=ARCConfig.temperature, help="Sampling temperature")

    args = parser.parse_args(argv)

    # Configure (ARCConfig already defaults to 4-bit loading)
    config = ARCConfig(
        max_new_tokens=args.max_tokens,
        temperature=args.temperature
    )

    if args.load_8bit:
        config.load_in_4bit = False
        config.load_in_8bit = True
    elif args.no_quant:
        config.load_in_4bit = False
        config.load_in_8bit = False

    # Load
    arc = ARCSystem(config)
    arc.load()

    # Run
    if args.prompt:
        response = arc.generate(
            args.prompt,
            system_prompt=args.system,
            use_arc=not args.no_arc,
            verbose=True
        )
        print(f"\n{response}\n")
    else:
        arc.chat(system_prompt=args.system)


if __name__ == "__main__":
    main()
|
||||||
3
model-00001-of-00004.safetensors
Normal file
3
model-00001-of-00004.safetensors
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:c65acba055624759f3844e6b553e503b28b6362302b5800a3363e7b9d0651477
|
||||||
|
size 4976698592
|
||||||
3
model-00002-of-00004.safetensors
Normal file
3
model-00002-of-00004.safetensors
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:1f7be5ec6b07d6a9f2bb2fff3b5ad8532ac1d24a0abb208a3c4f68408938202d
|
||||||
|
size 4999802616
|
||||||
3
model-00003-of-00004.safetensors
Normal file
3
model-00003-of-00004.safetensors
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:37aa5379bee102bd524ab56428aba4fd735645ba0f376fb37b8b3d5923be45cd
|
||||||
|
size 4915916080
|
||||||
3
model-00004-of-00004.safetensors
Normal file
3
model-00004-of-00004.safetensors
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:fae222101e3eec8ebef0ed6fbeaebec1b436d4c9f7d37cba9cdf44fc3a86e6a7
|
||||||
|
size 1168138808
|
||||||
299
model.safetensors.index.json
Normal file
299
model.safetensors.index.json
Normal file
@@ -0,0 +1,299 @@
|
|||||||
|
{
|
||||||
|
"metadata": {
|
||||||
|
"total_parameters": 8030261248,
|
||||||
|
"total_size": 16060522496
|
||||||
|
},
|
||||||
|
"weight_map": {
|
||||||
|
"lm_head.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.embed_tokens.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.31.input_layernorm.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.31.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.31.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.norm.weight": "model-00004-of-00004.safetensors"
|
||||||
|
}
|
||||||
|
}
|
||||||
3
risk_predictor.pt
Normal file
3
risk_predictor.pt
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:6ea75a1a8b408dadc229b464d0e1f131af33a3a974efa523ba9aad2780625fb3
|
||||||
|
size 8424206
|
||||||
23
special_tokens_map.json
Normal file
23
special_tokens_map.json
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
{
|
||||||
|
"bos_token": {
|
||||||
|
"content": "<|begin_of_text|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
},
|
||||||
|
"eos_token": {
|
||||||
|
"content": "<|im_end|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
},
|
||||||
|
"pad_token": {
|
||||||
|
"content": "<|im_end|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
}
|
||||||
|
}
|
||||||
3
sycophancy_head.pt
Normal file
3
sycophancy_head.pt
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:a410f16a20edcf2d1b5609c74e39bccae4d7ed0c7007b0eb15a39db984ba98e6
|
||||||
|
size 24216
|
||||||
3
tokenizer.json
Normal file
3
tokenizer.json
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:b637ba15306725e16ef8ab8570ec57fec66845b810ed4d4c2583564d79b0c158
|
||||||
|
size 17209680
|
||||||
2070
tokenizer_config.json
Normal file
2070
tokenizer_config.json
Normal file
File diff suppressed because it is too large
Load Diff
3
verbosity_head.pt
Normal file
3
verbosity_head.pt
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:ba1118f564de6f41db58f48a44141cea2800a490e7b9f9646414c713af49dadb
|
||||||
|
size 24206
|
||||||
Reference in New Issue
Block a user