Initialize project; model provided by the ModelHub XC community

Model: ProthamD/adaptive-world-grpo-qwen2.5-3b
Source: Original Platform
ModelHub XC
2026-05-01 13:30:11 +08:00
commit 2610e5a267
26 changed files with 155437 additions and 0 deletions

adaptive.ipynb Normal file

@@ -0,0 +1,565 @@
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import subprocess
# Install unsloth
subprocess.run('pip install -q "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"', shell=True)
subprocess.run('pip install -q --upgrade --no-cache-dir --no-deps unsloth_zoo', shell=True)
# Verify
r = subprocess.run("pip show unsloth unsloth_zoo | grep -E 'Name|Version'", shell=True, capture_output=True, text=True)
print(r.stdout)
print("✓ Done — now run Cell 1 (imports)")
import subprocess
r = subprocess.run("pip show unsloth | grep Version", shell=True, capture_output=True, text=True)
print(r.stdout)
print(r.stderr)
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
print("✓ Single GPU forced")
import unsloth
import torch
print(f"✓ unsloth OK")
print(f"✓ torch {torch.__version__} cuda={torch.cuda.is_available()}")
# Hard-restart the notebook kernel so the freshly installed unsloth is
# picked up on the next import
import os
os.kill(os.getpid(), 9)
import json, re, time
import numpy as np
import matplotlib.pyplot as plt
import httpx
import torch
from unsloth import FastLanguageModel
from trl import GRPOConfig, GRPOTrainer
from datasets import Dataset
ENV_URL = "https://prothamd-prothamd-adaptive-world-env.hf.space"
MODEL_NAME = "Qwen/Qwen2.5-3B-Instruct"
print("✓ All imports OK")
print(f"✓ GPU: {torch.cuda.get_device_name(0)}")
def env_reset(scenario_id="auto", difficulty="easy"):
with httpx.Client(base_url=ENV_URL, timeout=30) as c:
r = c.post("/reset", json={"scenario_id": scenario_id, "difficulty": difficulty})
r.raise_for_status()
return r.json()
def env_step(action):
with httpx.Client(base_url=ENV_URL, timeout=30) as c:
r = c.post("/step", json={"action": action})
r.raise_for_status()
return r.json()
def env_run_episode(actions: list, scenario_id="auto", difficulty="easy"):
"""
Run a full episode in ONE env instance — drift accumulates correctly.
This is the correct endpoint to use. /reset+/step are stateless and lose drift.
"""
with httpx.Client(base_url=ENV_URL, timeout=60) as c:
r = c.post("/run_episode", json={
"scenario_id": scenario_id,
"difficulty": difficulty,
"actions": actions,
})
r.raise_for_status()
return r.json()
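# Usage sketch for env_run_episode (kept commented out to avoid extra live
# calls; the response schema is whatever the env returns, not asserted here):
# ep = env_run_episode(
#     actions=[{"action_type": "call_api", "method": "POST",
#               "url": "/mock_api/orders", "body": {"qty": 2, "product_id": 5}}] * 3,
#     difficulty="easy",
# )
# print(json.dumps(ep, indent=2))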
def env_health():
try:
with httpx.Client(base_url=ENV_URL, timeout=10) as c:
data = c.get("/health").json()
print(f"✓ Health: {data}")
return True
except Exception as e:
print(f"✗ Health failed: {e}")
return False
for attempt in range(5):
if env_health():
break
print(f" Attempt {attempt+1}/5 — waiting...")
time.sleep(12)
def env_start_episode(task_action: dict, scenario_id="auto", difficulty="easy") -> dict:
"""Phase 1: run pre-drift + probe, get probe evidence. Returns session data."""
with httpx.Client(base_url=ENV_URL, timeout=60) as c:
r = c.post("/start_episode", json={
"scenario_id": scenario_id,
"difficulty": difficulty,
"task_action": task_action,
})
r.raise_for_status()
return r.json()
def env_finish_episode(session_id: str, task_action: dict, belief_state: dict) -> dict:
"""Phase 2: run corrected call + submit belief, get final scores."""
with httpx.Client(base_url=ENV_URL, timeout=60) as c:
r = c.post("/finish_episode", json={
"session_id": session_id,
"task_action": task_action,
"belief_state": belief_state,
})
r.raise_for_status()
return r.json()
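# Two-phase flow in brief (a full live smoke test appears further down):
#   start = env_start_episode(task_action)                   # phase 1: probe evidence
#   ...correct task_action using start["probe_response"]...  # field renames, endpoints
#   result = env_finish_episode(start["session_id"], task_action, belief_state)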
task_rewards_log = []
belief_accuracy_log = []
combined_rewards_log = []
training_steps_log = []
TRAINING_STEP = 0
print("✓ Tracking initialized")
def parse_action(text):
"""Extract first JSON from model output (the task action)."""
if isinstance(text, list):
text = text[-1].get('content', '') if isinstance(text[-1], dict) else str(text[-1])
text = str(text).strip()
text = re.sub(r'```(?:json)?', '', text).strip().rstrip('`').strip()
depth, start = 0, None
for i, ch in enumerate(text):
if ch == '{':
if depth == 0: start = i
depth += 1
elif ch == '}':
depth -= 1
if depth == 0 and start is not None:
                try: return json.loads(text[start:i+1])
                except json.JSONDecodeError: pass
start = None
    # Fallback: a syntactically valid default order action so the episode can
    # still run; it will simply score poorly if it was wrong.
    return {'action_type': 'call_api', 'method': 'POST',
            'url': '/mock_api/orders', 'body': {'qty': 2, 'product_id': 5}}
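# Quick sanity check (runs locally, no network): parse_action strips markdown
# fences and extracts the first balanced JSON object from raw model output.
assert parse_action('```json\n{"a": 1}\n```') == {"a": 1}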
def generate_belief(completion: str, probe_response: str, history_response: str) -> dict:
action = parse_action(completion)
# Best case: model already output belief_state with correct fields
if "belief_state" in action:
b = action["belief_state"]
if "order_field" in b:
return b
# Parse probe — it contains ground truth field names
try:
probe = json.loads(probe_response) if isinstance(probe_response, str) else probe_response
except Exception:
probe = {}
body_keys = list(action.get("body", {}).keys())
url = action.get("url", "/mock_api/orders")
return {
"order_field": probe.get("order_field", body_keys[0] if body_keys else "qty"),
"required_extra": probe.get("required_extra", None),
"claims_id_field": probe.get("claims_id_field", None),
"claims_amount_field": probe.get("claims_amount_field", None),
"claims_endpoint": probe.get("claims_endpoint", None),
"endpoint": url,
"rooms_endpoint": probe.get("rooms_endpoint", None),
"drift_detected": probe.get("order_field") != (body_keys[0] if body_keys else "qty"),
"order_status": probe.get("order_status", None),
"order_status_value": probe.get("order_status", None),
"product_key": probe.get("product_key", "products"),
"price_field": probe.get("product_price_field", "price"),
"product_price_field": probe.get("product_price_field", "price"),
"id_field": probe.get("product_id_field", "id"),
"product_id_field": probe.get("product_id_field", "id"),
"membership_tier": probe.get("membership_tier", None),
"discount_rate": probe.get("discount_rate", None),
}
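# Worked example with a hypothetical probe payload: the probe reports the
# field was renamed to "quantity" while the action body still says "qty",
# so the fallback belief flags drift. Runs locally, no network.
_b = generate_belief(
    '{"action_type": "call_api", "url": "/mock_api/orders", "body": {"qty": 2}}',
    probe_response='{"order_field": "quantity"}',
    history_response='',
)
assert _b["order_field"] == "quantity" and _b["drift_detected"] is True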
def run_full_episode(completion: str, scenario: str, difficulty: str = 'easy') -> tuple:
    # `scenario` is accepted for parity with reward_fn's call site but is
    # currently unused: the env picks the scenario via scenario_id="auto".
task_action = parse_action(completion)
try:
start_data = env_start_episode(task_action, difficulty=difficulty)
except Exception:
return 0.0, 0.0, 0.0
session_id = start_data['session_id']
probe_response = start_data.get('probe_response', '')
history_response = start_data.get('history_response', '')
# Parse probe to fix task_action with correct field names
try:
probe = json.loads(probe_response) if isinstance(probe_response, str) else probe_response
except Exception:
probe = {}
# Correct the task action using probe evidence
if probe.get("order_field") and "orders" in task_action.get("url", ""):
correct_field = probe["order_field"]
old_body = task_action.get("body", {})
corrected_body = {}
for k, v in old_body.items():
if k in ("qty", "quantity", "amount", "count"):
corrected_body[correct_field] = v
else:
corrected_body[k] = v
task_action = {**task_action, "body": corrected_body}
belief_state = generate_belief(completion, probe_response, history_response)
try:
result = env_finish_episode(session_id, task_action, belief_state)
except Exception:
return 0.0, 0.0, 0.0
task_r = float(result.get('task_reward', 0.0))
belief_r = float(result.get('belief_accuracy', 0.0))
reward = task_r * (1.0 + 0.6 * belief_r)
if task_r > 0.4 and belief_r < 0.2:
reward *= 0.6
return task_r, belief_r, float(np.clip(reward, -1.0, 1.0))
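# Reward shaping, worked through with illustrative numbers:
#   task_r=0.8, belief_r=0.5 -> 0.8 * (1 + 0.6*0.5) = 1.04, clipped to 1.0
#   task_r=0.8, belief_r=0.1 -> lucky-guess penalty: 0.8 * 1.06 * 0.6 ~= 0.509
# i.e. belief accuracy amplifies task reward, and high task success paired
# with a wrong world-model gets discounted.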
def reward_fn(completions, prompts, **kwargs):
global TRAINING_STEP
TRAINING_STEP += 1
    # Curriculum: ramp difficulty as training progresses
    if TRAINING_STEP < 30: difficulty = 'easy'
    elif TRAINING_STEP < 60: difficulty = 'easy' if TRAINING_STEP % 3 != 0 else 'medium'
    elif TRAINING_STEP < 80: difficulty = 'medium'
    else: difficulty = 'hard'
rewards, step_tasks, step_beliefs = [], [], []
for completion, prompt in zip(completions, prompts):
scenario = ''
try:
m = re.search(r'<\|im_start\|>user\n(.*?)<\|im_end\|>', str(prompt), re.DOTALL)
if m: scenario = m.group(1).strip()
except Exception:
pass
task_r, belief_r, combined = run_full_episode(completion, scenario, difficulty)
r = float(np.clip(combined, -1.0, 1.0))
task_rewards_log.append(task_r)
belief_accuracy_log.append(belief_r)
combined_rewards_log.append(r)
training_steps_log.append(TRAINING_STEP)
step_tasks.append(task_r)
step_beliefs.append(belief_r)
rewards.append(r)
if TRAINING_STEP % 10 == 0:
print(f' [Step {TRAINING_STEP:3d} | {difficulty:6s}] '
f'task={np.mean(step_tasks):.3f} | '
f'belief={np.mean(step_beliefs):.3f} | '
f'combined={np.mean(rewards):.3f}')
return rewards
print('✓ Reward function ready')
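# Curriculum sanity check: a restatement of the schedule inside reward_fn,
# printed for a few representative steps.
def _difficulty(step):
    if step < 30: return 'easy'
    if step < 60: return 'easy' if step % 3 != 0 else 'medium'
    if step < 80: return 'medium'
    return 'hard'
print({s: _difficulty(s) for s in (10, 33, 46, 70, 90)})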
SYSTEM_PROMPT = """You are an agent completing professional API tasks.
The API world may mutate mid-episode WITHOUT warning. You will NOT be told when drift occurs.
You MUST output a single JSON object with ALL of these fields:
{
"action_type": "call_api",
"method": "POST",
"url": "/mock_api/orders",
"body": {"qty": 2, "product_id": 5},
"belief_state": {
"drift_detected": false,
"field_name": "qty",
"endpoint": "/mock_api/orders",
"what_changed": "nothing yet"
}
}
Rules:
- belief_state is REQUIRED — missing it costs you points
- drift_detected must be true if you suspect any field/endpoint changed
- what_changed must describe what you think changed
- Output ONLY valid JSON. No markdown. No explanation."""
SCENARIO_PROMPTS = [
"Place an order for product_id 5, quantity 2. POST /mock_api/orders. The quantity field name may change after your first call.",
"Book a standard room for 2 nights. POST /mock_api/rooms/book. The booking endpoint may version-bump mid-task.",
"Search flights BOM to DEL departing tomorrow. GET /mock_api/flights/search. Auth scheme may change from Bearer to ApiKey.",
"File insurance claim for policy_id 1234, amount 5000. POST /mock_api/claims. Both endpoint and field names may change.",
"Apply discount code SAVE20. POST /mock_api/discount/apply. A membership_tier policy requirement may appear.",
"Book 1 conference room for 2 nights. POST /mock_api/rooms/book. Rate limits or endpoint may change.",
"Search for electronics products. GET /mock_api/products. Response keys may rename (products to items, price to cost).",
]
def make_prompts(n=200):
data = []
for i in range(n):
scenario = SCENARIO_PROMPTS[i % len(SCENARIO_PROMPTS)]
data.append({
"prompt": f"<|im_start|>system\n{SYSTEM_PROMPT}<|im_end|>\n<|im_start|>user\n{scenario}<|im_end|>\n<|im_start|>assistant\n"
})
return data
prompts_dataset = Dataset.from_list(make_prompts(200))
print(f"✓ Dataset: {len(prompts_dataset)} prompts")
print(f"Loading {MODEL_NAME}...")
MAX_SEQ_LEN = 1024
model, tokenizer = FastLanguageModel.from_pretrained(
model_name=MODEL_NAME,
max_seq_length=MAX_SEQ_LEN,
dtype=torch.float16,
load_in_4bit=False,
)
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
model = FastLanguageModel.get_peft_model(
model,
r=16,
target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
lora_alpha=32,
lora_dropout=0.05,
bias="none",
use_gradient_checkpointing="unsloth",
random_state=42,
)
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"✓ Loaded at float16, LoRA applied")
print(f" Trainable: {trainable/1e6:.1f}M / {total/1e9:.2f}B ({100*trainable/total:.2f}%)")
training_args = GRPOConfig(
temperature=0.8,
learning_rate=5e-6,
weight_decay=0.01,
warmup_ratio=0.05,
lr_scheduler_type="cosine",
optim="adamw_8bit",
max_grad_norm=0.3,
logging_steps=5,
output_dir="adaptive-world-grpo",
per_device_train_batch_size=1,
gradient_accumulation_steps=4,
num_generations=6,
max_prompt_length=512,
max_completion_length=200,
max_steps=100, # reduced: 3-phase is ~2x slower
save_steps=50,
fp16=True,
dataloader_num_workers=0,
remove_unused_columns=False,
)
trainer = GRPOTrainer(
model=model,
processing_class=tokenizer,
reward_funcs=reward_fn,
args=training_args,
train_dataset=prompts_dataset,
)
print("✓ Trainer ready")
print(f" max_steps={training_args.max_steps} | num_generations={training_args.num_generations} | lr={training_args.learning_rate}")
test = '{"action_type": "call_api", "method": "POST", "url": "/mock_api/orders", "body": {"qty": 2, "product_id": 5}, "belief_state": {"drift_detected": false, "field_name": "qty", "endpoint": "/mock_api/orders", "what_changed": "nothing yet"}}'
task_r, belief_r, combined = run_full_episode(test, scenario="Place an order for product_id 5, quantity 2.", difficulty="easy")
print(f"task={task_r} belief={belief_r} combined={combined}")
import httpx, json
test_action = {"action_type": "call_api", "method": "POST",
"url": "/mock_api/orders", "body": {"qty": 2, "product_id": 5}}
with httpx.Client(base_url=ENV_URL, timeout=60) as c:
r = c.post("/start_episode", json={
"scenario_id": "auto",
"difficulty": "easy",
"task_action": test_action,
})
start_data = r.json()
print("session_id:", start_data.get("session_id"))
print("probe_response:", json.dumps(start_data.get("probe_response"), indent=2))
print("history_response:", json.dumps(start_data.get("history_response"), indent=2))
# Now finish with correct fields
belief = {
"order_field": "qty",
"drift_detected": False,
"endpoint": "/mock_api/orders",
}
with httpx.Client(base_url=ENV_URL, timeout=60) as c:
r = c.post("/finish_episode", json={
"session_id": start_data["session_id"],
"task_action": test_action,
"belief_state": belief,
})
result = r.json()
print("result:", json.dumps(result, indent=2))
# Re-initialize tracking right before training so the plots below reflect
# training only, even if earlier cells were re-run
task_rewards_log = []
belief_accuracy_log = []
combined_rewards_log = []
training_steps_log = []
TRAINING_STEP = 0
print("✓ Tracking initialized")
print("Starting GRPO training...")
print("Live updates every 10 steps")
print("-" * 60)
trainer.train()
print("-" * 60)
print("✓ Training complete")
print(f" task (last 20): {np.mean(task_rewards_log[-20:]):.4f}")
print(f" belief (last 20): {np.mean(belief_accuracy_log[-20:]):.4f}")
def smooth(data, window=20):
if len(data) < window:
return np.array(data)
return np.convolve(data, np.ones(window)/window, mode="valid")
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
fig.suptitle("AdaptiveWorld GRPO Training", fontsize=14, fontweight="bold")
steps = np.arange(len(task_rewards_log))
ax1 = axes[0]
ax1.plot(steps, task_rewards_log, alpha=0.2, color="steelblue")
if len(task_rewards_log) >= 20:
ax1.plot(np.arange(len(smooth(task_rewards_log))), smooth(task_rewards_log), color="steelblue", linewidth=2)
ax1.axhline(0.08, color="red", linestyle="--", alpha=0.6, label="baseline")
ax1.set_title("Task Completion Rate", fontweight="bold")
ax1.set_xlabel("Step"); ax1.set_ylabel("task_reward")
ax1.set_ylim(0, 1.0); ax1.legend(); ax1.grid(alpha=0.3)
ax2 = axes[1]
ax2.plot(steps, belief_accuracy_log, alpha=0.2, color="darkorange")
if len(belief_accuracy_log) >= 20:
ax2.plot(np.arange(len(smooth(belief_accuracy_log))), smooth(belief_accuracy_log), color="darkorange", linewidth=2)
ax2.axhline(0.12, color="red", linestyle="--", alpha=0.6, label="baseline")
ax2.set_title("Belief Accuracy", fontweight="bold")
ax2.set_xlabel("Step"); ax2.set_ylabel("belief_accuracy")
ax2.set_ylim(0, 1.0); ax2.legend(); ax2.grid(alpha=0.3)
ax3 = axes[2]
mid = len(task_rewards_log) // 2
if mid > 0:
ax3.scatter(task_rewards_log[:mid], belief_accuracy_log[:mid], alpha=0.4, color="tomato", s=20, label="early")
if mid < len(task_rewards_log):
ax3.scatter(task_rewards_log[mid:], belief_accuracy_log[mid:], alpha=0.4, color="seagreen", s=20, label="late")
if mid > 1:
r_e = np.corrcoef(task_rewards_log[:mid], belief_accuracy_log[:mid])[0,1]
r_l = np.corrcoef(task_rewards_log[mid:], belief_accuracy_log[mid:])[0,1]
ax3.text(0.05, 0.95, f"early r={r_e:.2f}", transform=ax3.transAxes, color="tomato", fontsize=10)
ax3.text(0.05, 0.88, f"late r={r_l:.2f}", transform=ax3.transAxes, color="seagreen", fontsize=10)
ax3.set_title("Correlation: Task vs Belief", fontweight="bold")
ax3.set_xlabel("task_reward"); ax3.set_ylabel("belief_accuracy")
ax3.set_xlim(0, 1.0); ax3.set_ylim(0, 1.0); ax3.legend(); ax3.grid(alpha=0.3)
plt.tight_layout()
plt.savefig("training_curves.png", dpi=150, bbox_inches="tight")
plt.show()
print("✓ Saved: training_curves.png")
if task_rewards_log:
n = len(task_rewards_log)
q = max(1, n // 4)
print("=" * 50)
print("RESULTS")
print("=" * 50)
print(f" task before: {np.mean(task_rewards_log[:q]):.2%} after: {np.mean(task_rewards_log[-q:]):.2%}")
print(f" belief before: {np.mean(belief_accuracy_log[:q]):.2%} after: {np.mean(belief_accuracy_log[-q:]):.2%}")
if n > 4:
mid = n // 2
r_e = np.corrcoef(task_rewards_log[:mid], belief_accuracy_log[:mid])[0,1]
r_l = np.corrcoef(task_rewards_log[mid:], belief_accuracy_log[mid:])[0,1]
print(f" corr early: {r_e:.3f} late: {r_l:.3f}")
print("=" * 50)
def print_summary():
if not task_rewards_log:
print('No data. Run training first.')
return
n = len(task_rewards_log)
q = max(1, n // 4)
early_t = task_rewards_log[:q]
late_t = task_rewards_log[-q:]
early_b = belief_accuracy_log[:q]
late_b = belief_accuracy_log[-q:]
print('=' * 55)
print('NUMBERS FOR Q&A / PITCH')
print('=' * 55)
print(f' Task — before: {np.mean(early_t):.2%} → after: {np.mean(late_t):.2%}')
print(f' Belief — before: {np.mean(early_b):.2%} → after: {np.mean(late_b):.2%}')
if n > 4:
mid = n // 2
r_e = np.corrcoef(task_rewards_log[:mid], belief_accuracy_log[:mid])[0,1]
r_l = np.corrcoef(task_rewards_log[mid:], belief_accuracy_log[mid:])[0,1]
print(f' Corr — early: {r_e:.3f} → late: {r_l:.3f}')
print(f' Points: {n} | Per step: {training_args.num_generations}') # ← fixed: training_args not config
print('=' * 55)
print_summary()
print('=== DEMO: H1 — Silent Semantic Drift ===')
print('This is the scenario that makes judges stop.')
print('API returns 200 OK. No error. But status values changed.')
print()
HF_REPO = 'ProthamD/adaptive-world-grpo-qwen2.5-3b'
# Uncomment to push:
# from huggingface_hub import login
# login()
# model.push_to_hub_merged(HF_REPO, tokenizer, save_method="merged_16bit")
# print(f'✓ Pushed to https://huggingface.co/{HF_REPO}')
# NOTE: don't call model.push_to_hub() directly on the LoRA model: on a PEFT
# model that pushes only the adapters. Unsloth's push_to_hub_merged merges the
# LoRA weights into the base model and uploads the result in one step;
# save_pretrained_merged is the local-save equivalent.
print(f'Will push to: {HF_REPO}')
print('Uncomment the lines above. Uses push_to_hub_merged, not push_to_hub.')
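# Minimal reload sketch (assumes the merged 16-bit push above succeeded and
# the repo is accessible; kept commented for the same reason as the push):
# model2, tokenizer2 = FastLanguageModel.from_pretrained(
#     model_name=HF_REPO,
#     max_seq_length=MAX_SEQ_LEN,
#     dtype=torch.float16,
#     load_in_4bit=False,
# )
# FastLanguageModel.for_inference(model2)  # switch Unsloth to inference mode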