Initialize project; model provided by the ModelHub XC community

Model: ProthamD/adaptive-world-grpo-qwen2.5-3b
Source: Original Platform
ModelHub XC
2026-05-01 13:30:11 +08:00
commit 2610e5a267
26 changed files with 155437 additions and 0 deletions

adaptive.ipynb Normal file

@@ -0,0 +1,565 @@
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import subprocess
# Install unsloth
subprocess.run('pip install -q "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"', shell=True)
subprocess.run('pip install -q --upgrade --no-cache-dir --no-deps unsloth_zoo', shell=True)
# Verify
r = subprocess.run("pip show unsloth unsloth_zoo | grep -E 'Name|Version'", shell=True, capture_output=True, text=True)
print(r.stdout)
print("✓ Done — now run Cell 1 (imports)")
import subprocess
r = subprocess.run("pip show unsloth | grep Version", shell=True, capture_output=True, text=True)
print(r.stdout)
print(r.stderr)
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
print("✓ Single GPU forced")
import unsloth
import torch
print(f"✓ unsloth OK")
print(f"✓ torch {torch.__version__} cuda={torch.cuda.is_available()}")
# Hard-restart the notebook kernel so the freshly installed unsloth is
# picked up on the next import
import os
os.kill(os.getpid(), 9)
import json, re, time
import numpy as np
import matplotlib.pyplot as plt
import httpx
import torch
from unsloth import FastLanguageModel
from trl import GRPOConfig, GRPOTrainer
from datasets import Dataset
ENV_URL = "https://prothamd-prothamd-adaptive-world-env.hf.space"
MODEL_NAME = "Qwen/Qwen2.5-3B-Instruct"
print("✓ All imports OK")
print(f"✓ GPU: {torch.cuda.get_device_name(0)}")
def env_reset(scenario_id="auto", difficulty="easy"):
with httpx.Client(base_url=ENV_URL, timeout=30) as c:
r = c.post("/reset", json={"scenario_id": scenario_id, "difficulty": difficulty})
r.raise_for_status()
return r.json()
def env_step(action):
with httpx.Client(base_url=ENV_URL, timeout=30) as c:
r = c.post("/step", json={"action": action})
r.raise_for_status()
return r.json()
def env_run_episode(actions: list, scenario_id="auto", difficulty="easy"):
"""
Run a full episode in ONE env instance — drift accumulates correctly.
This is the correct endpoint to use. /reset+/step are stateless and lose drift.
"""
with httpx.Client(base_url=ENV_URL, timeout=60) as c:
r = c.post("/run_episode", json={
"scenario_id": scenario_id,
"difficulty": difficulty,
"actions": actions,
})
r.raise_for_status()
return r.json()
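# Usage sketch for env_run_episode (kept commented out to avoid extra live
# calls; the response schema is whatever the env returns, not asserted here):
# ep = env_run_episode(
#     actions=[{"action_type": "call_api", "method": "POST",
#               "url": "/mock_api/orders", "body": {"qty": 2, "product_id": 5}}] * 3,
#     difficulty="easy",
# )
# print(json.dumps(ep, indent=2))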
def env_health():
try:
with httpx.Client(base_url=ENV_URL, timeout=10) as c:
data = c.get("/health").json()
print(f"✓ Health: {data}")
return True
except Exception as e:
print(f"✗ Health failed: {e}")
return False
for attempt in range(5):
if env_health():
break
print(f" Attempt {attempt+1}/5 — waiting...")
time.sleep(12)
def env_start_episode(task_action: dict, scenario_id="auto", difficulty="easy") -> dict:
"""Phase 1: run pre-drift + probe, get probe evidence. Returns session data."""
with httpx.Client(base_url=ENV_URL, timeout=60) as c:
r = c.post("/start_episode", json={
"scenario_id": scenario_id,
"difficulty": difficulty,
"task_action": task_action,
})
r.raise_for_status()
return r.json()
def env_finish_episode(session_id: str, task_action: dict, belief_state: dict) -> dict:
"""Phase 2: run corrected call + submit belief, get final scores."""
with httpx.Client(base_url=ENV_URL, timeout=60) as c:
r = c.post("/finish_episode", json={
"session_id": session_id,
"task_action": task_action,
"belief_state": belief_state,
})
r.raise_for_status()
return r.json()
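# Two-phase flow in brief (a full live smoke test appears further down):
#   start = env_start_episode(task_action)                   # phase 1: probe evidence
#   ...correct task_action using start["probe_response"]...  # field renames, endpoints
#   result = env_finish_episode(start["session_id"], task_action, belief_state)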
task_rewards_log = []
belief_accuracy_log = []
combined_rewards_log = []
training_steps_log = []
TRAINING_STEP = 0
print("✓ Tracking initialized")
def parse_action(text):
"""Extract first JSON from model output (the task action)."""
if isinstance(text, list):
text = text[-1].get('content', '') if isinstance(text[-1], dict) else str(text[-1])
text = str(text).strip()
text = re.sub(r'```(?:json)?', '', text).strip().rstrip('`').strip()
depth, start = 0, None
for i, ch in enumerate(text):
if ch == '{':
if depth == 0: start = i
depth += 1
elif ch == '}':
depth -= 1
if depth == 0 and start is not None:
                try: return json.loads(text[start:i+1])
                except json.JSONDecodeError: pass
start = None
    # Fallback: a syntactically valid default order action so the episode can
    # still run; it will simply score poorly if it was wrong.
    return {'action_type': 'call_api', 'method': 'POST',
            'url': '/mock_api/orders', 'body': {'qty': 2, 'product_id': 5}}
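# Quick sanity check (runs locally, no network): parse_action strips markdown
# fences and extracts the first balanced JSON object from raw model output.
assert parse_action('```json\n{"a": 1}\n```') == {"a": 1}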
def generate_belief(completion: str, probe_response: str, history_response: str) -> dict:
action = parse_action(completion)
# Best case: model already output belief_state with correct fields
if "belief_state" in action:
b = action["belief_state"]
if "order_field" in b:
return b
# Parse probe — it contains ground truth field names
try:
probe = json.loads(probe_response) if isinstance(probe_response, str) else probe_response
except Exception:
probe = {}
body_keys = list(action.get("body", {}).keys())
url = action.get("url", "/mock_api/orders")
return {
"order_field": probe.get("order_field", body_keys[0] if body_keys else "qty"),
"required_extra": probe.get("required_extra", None),
"claims_id_field": probe.get("claims_id_field", None),
"claims_amount_field": probe.get("claims_amount_field", None),
"claims_endpoint": probe.get("claims_endpoint", None),
"endpoint": url,
"rooms_endpoint": probe.get("rooms_endpoint", None),
"drift_detected": probe.get("order_field") != (body_keys[0] if body_keys else "qty"),
"order_status": probe.get("order_status", None),
"order_status_value": probe.get("order_status", None),
"product_key": probe.get("product_key", "products"),
"price_field": probe.get("product_price_field", "price"),
"product_price_field": probe.get("product_price_field", "price"),
"id_field": probe.get("product_id_field", "id"),
"product_id_field": probe.get("product_id_field", "id"),
"membership_tier": probe.get("membership_tier", None),
"discount_rate": probe.get("discount_rate", None),
}
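# Worked example with a hypothetical probe payload: the probe reports the
# field was renamed to "quantity" while the action body still says "qty",
# so the fallback belief flags drift. Runs locally, no network.
_b = generate_belief(
    '{"action_type": "call_api", "url": "/mock_api/orders", "body": {"qty": 2}}',
    probe_response='{"order_field": "quantity"}',
    history_response='',
)
assert _b["order_field"] == "quantity" and _b["drift_detected"] is True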
def run_full_episode(completion: str, scenario: str, difficulty: str = 'easy') -> tuple:
    # `scenario` is accepted for parity with reward_fn's call site but is
    # currently unused: the env picks the scenario via scenario_id="auto".
task_action = parse_action(completion)
try:
start_data = env_start_episode(task_action, difficulty=difficulty)
except Exception:
return 0.0, 0.0, 0.0
session_id = start_data['session_id']
probe_response = start_data.get('probe_response', '')
history_response = start_data.get('history_response', '')
# Parse probe to fix task_action with correct field names
try:
probe = json.loads(probe_response) if isinstance(probe_response, str) else probe_response
except Exception:
probe = {}
# Correct the task action using probe evidence
if probe.get("order_field") and "orders" in task_action.get("url", ""):
correct_field = probe["order_field"]
old_body = task_action.get("body", {})
corrected_body = {}
for k, v in old_body.items():
if k in ("qty", "quantity", "amount", "count"):
corrected_body[correct_field] = v
else:
corrected_body[k] = v
task_action = {**task_action, "body": corrected_body}
belief_state = generate_belief(completion, probe_response, history_response)
try:
result = env_finish_episode(session_id, task_action, belief_state)
except Exception:
return 0.0, 0.0, 0.0
task_r = float(result.get('task_reward', 0.0))
belief_r = float(result.get('belief_accuracy', 0.0))
reward = task_r * (1.0 + 0.6 * belief_r)
if task_r > 0.4 and belief_r < 0.2:
reward *= 0.6
return task_r, belief_r, float(np.clip(reward, -1.0, 1.0))
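# Reward shaping, worked through with illustrative numbers:
#   task_r=0.8, belief_r=0.5 -> 0.8 * (1 + 0.6*0.5) = 1.04, clipped to 1.0
#   task_r=0.8, belief_r=0.1 -> lucky-guess penalty: 0.8 * 1.06 * 0.6 ~= 0.509
# i.e. belief accuracy amplifies task reward, and high task success paired
# with a wrong world-model gets discounted.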
def reward_fn(completions, prompts, **kwargs):
global TRAINING_STEP
TRAINING_STEP += 1
    # Curriculum: ramp difficulty as training progresses
    if TRAINING_STEP < 30: difficulty = 'easy'
    elif TRAINING_STEP < 60: difficulty = 'easy' if TRAINING_STEP % 3 != 0 else 'medium'
    elif TRAINING_STEP < 80: difficulty = 'medium'
    else: difficulty = 'hard'
rewards, step_tasks, step_beliefs = [], [], []
for completion, prompt in zip(completions, prompts):
scenario = ''
try:
m = re.search(r'<\|im_start\|>user\n(.*?)<\|im_end\|>', str(prompt), re.DOTALL)
if m: scenario = m.group(1).strip()
except Exception:
pass
task_r, belief_r, combined = run_full_episode(completion, scenario, difficulty)
r = float(np.clip(combined, -1.0, 1.0))
task_rewards_log.append(task_r)
belief_accuracy_log.append(belief_r)
combined_rewards_log.append(r)
training_steps_log.append(TRAINING_STEP)
step_tasks.append(task_r)
step_beliefs.append(belief_r)
rewards.append(r)
if TRAINING_STEP % 10 == 0:
print(f' [Step {TRAINING_STEP:3d} | {difficulty:6s}] '
f'task={np.mean(step_tasks):.3f} | '
f'belief={np.mean(step_beliefs):.3f} | '
f'combined={np.mean(rewards):.3f}')
return rewards
print('✓ Reward function ready')
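# Curriculum sanity check: a restatement of the schedule inside reward_fn,
# printed for a few representative steps.
def _difficulty(step):
    if step < 30: return 'easy'
    if step < 60: return 'easy' if step % 3 != 0 else 'medium'
    if step < 80: return 'medium'
    return 'hard'
print({s: _difficulty(s) for s in (10, 33, 46, 70, 90)})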
SYSTEM_PROMPT = """You are an agent completing professional API tasks.
The API world may mutate mid-episode WITHOUT warning. You will NOT be told when drift occurs.
You MUST output a single JSON object with ALL of these fields:
{
"action_type": "call_api",
"method": "POST",
"url": "/mock_api/orders",
"body": {"qty": 2, "product_id": 5},
"belief_state": {
"drift_detected": false,
"field_name": "qty",
"endpoint": "/mock_api/orders",
"what_changed": "nothing yet"
}
}
Rules:
- belief_state is REQUIRED — missing it costs you points
- drift_detected must be true if you suspect any field/endpoint changed
- what_changed must describe what you think changed
- Output ONLY valid JSON. No markdown. No explanation."""
SCENARIO_PROMPTS = [
"Place an order for product_id 5, quantity 2. POST /mock_api/orders. The quantity field name may change after your first call.",
"Book a standard room for 2 nights. POST /mock_api/rooms/book. The booking endpoint may version-bump mid-task.",
"Search flights BOM to DEL departing tomorrow. GET /mock_api/flights/search. Auth scheme may change from Bearer to ApiKey.",
"File insurance claim for policy_id 1234, amount 5000. POST /mock_api/claims. Both endpoint and field names may change.",
"Apply discount code SAVE20. POST /mock_api/discount/apply. A membership_tier policy requirement may appear.",
"Book 1 conference room for 2 nights. POST /mock_api/rooms/book. Rate limits or endpoint may change.",
"Search for electronics products. GET /mock_api/products. Response keys may rename (products to items, price to cost).",
]
def make_prompts(n=200):
data = []
for i in range(n):
scenario = SCENARIO_PROMPTS[i % len(SCENARIO_PROMPTS)]
data.append({
"prompt": f"<|im_start|>system\n{SYSTEM_PROMPT}<|im_end|>\n<|im_start|>user\n{scenario}<|im_end|>\n<|im_start|>assistant\n"
})
return data
prompts_dataset = Dataset.from_list(make_prompts(200))
print(f"✓ Dataset: {len(prompts_dataset)} prompts")
print(f"Loading {MODEL_NAME}...")
MAX_SEQ_LEN = 1024
model, tokenizer = FastLanguageModel.from_pretrained(
model_name=MODEL_NAME,
max_seq_length=MAX_SEQ_LEN,
dtype=torch.float16,
load_in_4bit=False,
)
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
model = FastLanguageModel.get_peft_model(
model,
r=16,
target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
lora_alpha=32,
lora_dropout=0.05,
bias="none",
use_gradient_checkpointing="unsloth",
random_state=42,
)
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"✓ Loaded at float16, LoRA applied")
print(f" Trainable: {trainable/1e6:.1f}M / {total/1e9:.2f}B ({100*trainable/total:.2f}%)")
training_args = GRPOConfig(
temperature=0.8,
learning_rate=5e-6,
weight_decay=0.01,
warmup_ratio=0.05,
lr_scheduler_type="cosine",
optim="adamw_8bit",
max_grad_norm=0.3,
logging_steps=5,
output_dir="adaptive-world-grpo",
per_device_train_batch_size=1,
gradient_accumulation_steps=4,
num_generations=6,
max_prompt_length=512,
max_completion_length=200,
max_steps=100, # reduced: 3-phase is ~2x slower
save_steps=50,
fp16=True,
dataloader_num_workers=0,
remove_unused_columns=False,
)
trainer = GRPOTrainer(
model=model,
processing_class=tokenizer,
reward_funcs=reward_fn,
args=training_args,
train_dataset=prompts_dataset,
)
print("✓ Trainer ready")
print(f" max_steps={training_args.max_steps} | num_generations={training_args.num_generations} | lr={training_args.learning_rate}")
test = '{"action_type": "call_api", "method": "POST", "url": "/mock_api/orders", "body": {"qty": 2, "product_id": 5}, "belief_state": {"drift_detected": false, "field_name": "qty", "endpoint": "/mock_api/orders", "what_changed": "nothing yet"}}'
task_r, belief_r, combined = run_full_episode(test, scenario="Place an order for product_id 5, quantity 2.", difficulty="easy")
print(f"task={task_r} belief={belief_r} combined={combined}")
import httpx, json
test_action = {"action_type": "call_api", "method": "POST",
"url": "/mock_api/orders", "body": {"qty": 2, "product_id": 5}}
with httpx.Client(base_url=ENV_URL, timeout=60) as c:
r = c.post("/start_episode", json={
"scenario_id": "auto",
"difficulty": "easy",
"task_action": test_action,
})
start_data = r.json()
print("session_id:", start_data.get("session_id"))
print("probe_response:", json.dumps(start_data.get("probe_response"), indent=2))
print("history_response:", json.dumps(start_data.get("history_response"), indent=2))
# Now finish with correct fields
belief = {
"order_field": "qty",
"drift_detected": False,
"endpoint": "/mock_api/orders",
}
with httpx.Client(base_url=ENV_URL, timeout=60) as c:
r = c.post("/finish_episode", json={
"session_id": start_data["session_id"],
"task_action": test_action,
"belief_state": belief,
})
result = r.json()
print("result:", json.dumps(result, indent=2))
# Re-initialize tracking right before training so the plots below reflect
# training only, even if earlier cells were re-run
task_rewards_log = []
belief_accuracy_log = []
combined_rewards_log = []
training_steps_log = []
TRAINING_STEP = 0
print("✓ Tracking initialized")
print("Starting GRPO training...")
print("Live updates every 10 steps")
print("-" * 60)
trainer.train()
print("-" * 60)
print("✓ Training complete")
print(f" task (last 20): {np.mean(task_rewards_log[-20:]):.4f}")
print(f" belief (last 20): {np.mean(belief_accuracy_log[-20:]):.4f}")
def smooth(data, window=20):
if len(data) < window:
return np.array(data)
return np.convolve(data, np.ones(window)/window, mode="valid")
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
fig.suptitle("AdaptiveWorld GRPO Training", fontsize=14, fontweight="bold")
steps = np.arange(len(task_rewards_log))
ax1 = axes[0]
ax1.plot(steps, task_rewards_log, alpha=0.2, color="steelblue")
if len(task_rewards_log) >= 20:
ax1.plot(np.arange(len(smooth(task_rewards_log))), smooth(task_rewards_log), color="steelblue", linewidth=2)
ax1.axhline(0.08, color="red", linestyle="--", alpha=0.6, label="baseline")
ax1.set_title("Task Completion Rate", fontweight="bold")
ax1.set_xlabel("Step"); ax1.set_ylabel("task_reward")
ax1.set_ylim(0, 1.0); ax1.legend(); ax1.grid(alpha=0.3)
ax2 = axes[1]
ax2.plot(steps, belief_accuracy_log, alpha=0.2, color="darkorange")
if len(belief_accuracy_log) >= 20:
ax2.plot(np.arange(len(smooth(belief_accuracy_log))), smooth(belief_accuracy_log), color="darkorange", linewidth=2)
ax2.axhline(0.12, color="red", linestyle="--", alpha=0.6, label="baseline")
ax2.set_title("Belief Accuracy", fontweight="bold")
ax2.set_xlabel("Step"); ax2.set_ylabel("belief_accuracy")
ax2.set_ylim(0, 1.0); ax2.legend(); ax2.grid(alpha=0.3)
ax3 = axes[2]
mid = len(task_rewards_log) // 2
if mid > 0:
ax3.scatter(task_rewards_log[:mid], belief_accuracy_log[:mid], alpha=0.4, color="tomato", s=20, label="early")
if mid < len(task_rewards_log):
ax3.scatter(task_rewards_log[mid:], belief_accuracy_log[mid:], alpha=0.4, color="seagreen", s=20, label="late")
if mid > 1:
r_e = np.corrcoef(task_rewards_log[:mid], belief_accuracy_log[:mid])[0,1]
r_l = np.corrcoef(task_rewards_log[mid:], belief_accuracy_log[mid:])[0,1]
ax3.text(0.05, 0.95, f"early r={r_e:.2f}", transform=ax3.transAxes, color="tomato", fontsize=10)
ax3.text(0.05, 0.88, f"late r={r_l:.2f}", transform=ax3.transAxes, color="seagreen", fontsize=10)
ax3.set_title("Correlation: Task vs Belief", fontweight="bold")
ax3.set_xlabel("task_reward"); ax3.set_ylabel("belief_accuracy")
ax3.set_xlim(0, 1.0); ax3.set_ylim(0, 1.0); ax3.legend(); ax3.grid(alpha=0.3)
plt.tight_layout()
plt.savefig("training_curves.png", dpi=150, bbox_inches="tight")
plt.show()
print("✓ Saved: training_curves.png")
if task_rewards_log:
n = len(task_rewards_log)
q = max(1, n // 4)
print("=" * 50)
print("RESULTS")
print("=" * 50)
print(f" task before: {np.mean(task_rewards_log[:q]):.2%} after: {np.mean(task_rewards_log[-q:]):.2%}")
print(f" belief before: {np.mean(belief_accuracy_log[:q]):.2%} after: {np.mean(belief_accuracy_log[-q:]):.2%}")
if n > 4:
mid = n // 2
r_e = np.corrcoef(task_rewards_log[:mid], belief_accuracy_log[:mid])[0,1]
r_l = np.corrcoef(task_rewards_log[mid:], belief_accuracy_log[mid:])[0,1]
print(f" corr early: {r_e:.3f} late: {r_l:.3f}")
print("=" * 50)
def print_summary():
if not task_rewards_log:
print('No data. Run training first.')
return
n = len(task_rewards_log)
q = max(1, n // 4)
early_t = task_rewards_log[:q]
late_t = task_rewards_log[-q:]
early_b = belief_accuracy_log[:q]
late_b = belief_accuracy_log[-q:]
print('=' * 55)
print('NUMBERS FOR Q&A / PITCH')
print('=' * 55)
print(f' Task — before: {np.mean(early_t):.2%} → after: {np.mean(late_t):.2%}')
print(f' Belief — before: {np.mean(early_b):.2%} → after: {np.mean(late_b):.2%}')
if n > 4:
mid = n // 2
r_e = np.corrcoef(task_rewards_log[:mid], belief_accuracy_log[:mid])[0,1]
r_l = np.corrcoef(task_rewards_log[mid:], belief_accuracy_log[mid:])[0,1]
print(f' Corr — early: {r_e:.3f} → late: {r_l:.3f}')
print(f' Points: {n} | Per step: {training_args.num_generations}') # ← fixed: training_args not config
print('=' * 55)
print_summary()
print('=== DEMO: H1 — Silent Semantic Drift ===')
print('This is the scenario that makes judges stop.')
print('API returns 200 OK. No error. But status values changed.')
print()
HF_REPO = 'ProthamD/adaptive-world-grpo-qwen2.5-3b'
# Uncomment to push:
# from huggingface_hub import login
# login()
# model.push_to_hub_merged(HF_REPO, tokenizer, save_method="merged_16bit")
# print(f'✓ Pushed to https://huggingface.co/{HF_REPO}')
# NOTE: don't call model.push_to_hub() directly on the LoRA model: on a PEFT
# model that pushes only the adapters. Unsloth's push_to_hub_merged merges the
# LoRA weights into the base model and uploads the result in one step;
# save_pretrained_merged is the local-save equivalent.
print(f'Will push to: {HF_REPO}')
print('Uncomment the lines above. Uses push_to_hub_merged, not push_to_hub.')
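# Minimal reload sketch (assumes the merged 16-bit push above succeeded and
# the repo is accessible; kept commented for the same reason as the push):
# model2, tokenizer2 = FastLanguageModel.from_pretrained(
#     model_name=HF_REPO,
#     max_seq_length=MAX_SEQ_LEN,
#     dtype=torch.float16,
#     load_in_4bit=False,
# )
# FastLanguageModel.for_inference(model2)  # switch Unsloth to inference mode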