# ============================================================================
# Cell 0 — environment bootstrap (install unsloth, pin to one GPU).
# NOTE(review): this file is a notebook flattened to a script; the
# os.kill() below is an intentional kernel restart after pip install.
# ============================================================================
import os
import subprocess
import json
import re
import time

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Install unsloth (nightly from git) and its zoo of model patches.
subprocess.run('pip install -q "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"', shell=True)
subprocess.run('pip install -q --upgrade --no-cache-dir --no-deps unsloth_zoo', shell=True)

# Verify the install by echoing package name/version.
r = subprocess.run("pip show unsloth unsloth_zoo | grep -E 'Name|Version'", shell=True, capture_output=True, text=True)
print(r.stdout)
print("✓ Done — now run Cell 1 (imports)")

r = subprocess.run("pip show unsloth | grep Version", shell=True, capture_output=True, text=True)
print(r.stdout)
print(r.stderr)

os.environ["CUDA_VISIBLE_DEVICES"] = "0"
print("✓ Single GPU forced")

import unsloth  # must be imported before torch/trl so its patches apply
import torch
print(f"✓ unsloth OK")
print(f"✓ torch {torch.__version__} cuda={torch.cuda.is_available()}")

# NOTE(review): SIGKILL of the current process — in a notebook this restarts
# the kernel so the freshly-installed packages load cleanly. In a flat script
# it terminates execution here; everything below belongs to later cells.
os.kill(os.getpid(), 9)

# ============================================================================
# Cell 1 — imports and constants.
# ============================================================================
import numpy as np
import matplotlib.pyplot as plt
import httpx
import torch
from unsloth import FastLanguageModel
from trl import GRPOConfig, GRPOTrainer
from datasets import Dataset

# Remote AdaptiveWorld environment (HF Space) and the policy model to tune.
ENV_URL = "https://prothamd-prothamd-adaptive-world-env.hf.space"
MODEL_NAME = "Qwen/Qwen2.5-3B-Instruct"
print("✓ All imports OK")
print(f"✓ GPU: {torch.cuda.get_device_name(0)}")


def env_reset(scenario_id="auto", difficulty="easy"):
    """POST /reset — start a fresh scenario; returns the env's JSON state.

    NOTE(review): per env_run_episode's docstring, /reset + /step are
    stateless between calls and lose drift; kept for debugging only.
    """
    with httpx.Client(base_url=ENV_URL, timeout=30) as c:
        r = c.post("/reset", json={"scenario_id": scenario_id, "difficulty": difficulty})
        r.raise_for_status()
        return r.json()


def env_step(action):
    """POST /step — execute one action; returns the env's JSON response."""
    with httpx.Client(base_url=ENV_URL, timeout=30) as c:
        r = c.post("/step", json={"action": action})
        r.raise_for_status()
        return r.json()


def env_run_episode(actions: list, scenario_id="auto", difficulty="easy"):
    """Run a full episode in ONE env instance — drift accumulates correctly.

    This is the correct endpoint to use. /reset+/step are stateless and
    lose drift.
    """
    with httpx.Client(base_url=ENV_URL, timeout=60) as c:
        r = c.post("/run_episode", json={
            "scenario_id": scenario_id,
            "difficulty": difficulty,
            "actions": actions,
        })
        r.raise_for_status()
        return r.json()


def env_health():
    """GET /health — True when the Space answers; never raises."""
    try:
        with httpx.Client(base_url=ENV_URL, timeout=10) as c:
            data = c.get("/health").json()
        print(f"✓ Health: {data}")
        return True
    except Exception as e:
        print(f"✗ Health failed: {e}")
        return False


# Wait for the Space to wake up (cold starts take a while). After 5 failed
# attempts we fall through and let the first real call fail loudly.
for attempt in range(5):
    if env_health():
        break
    print(f" Attempt {attempt+1}/5 — waiting...")
    time.sleep(12)


def env_start_episode(task_action: dict, scenario_id="auto", difficulty="easy") -> dict:
    """Phase 1: run pre-drift + probe, get probe evidence. Returns session data."""
    with httpx.Client(base_url=ENV_URL, timeout=60) as c:
        r = c.post("/start_episode", json={
            "scenario_id": scenario_id,
            "difficulty": difficulty,
            "task_action": task_action,
        })
        r.raise_for_status()
        return r.json()


def env_finish_episode(session_id: str, task_action: dict, belief_state: dict) -> dict:
    """Phase 2: run corrected call + submit belief, get final scores."""
    with httpx.Client(base_url=ENV_URL, timeout=60) as c:
        r = c.post("/finish_episode", json={
            "session_id": session_id,
            "task_action": task_action,
            "belief_state": belief_state,
        })
        r.raise_for_status()
        return r.json()


# Global per-completion tracking, appended to by reward_fn during training.
task_rewards_log = []
belief_accuracy_log = []
combined_rewards_log = []
training_steps_log = []
TRAINING_STEP = 0
print("✓ Tracking initialized")


def parse_action(text):
    """Extract first JSON from model output (the task action).

    Accepts a raw string or a chat-message list; strips markdown fences,
    then scans for the first balanced {...} that parses as JSON. Falls
    back to a hard-coded default order action when nothing parses.
    """
    if isinstance(text, list):
        # FIX: guard against an empty completion list (text[-1] would raise).
        last = text[-1] if text else ''
        text = last.get('content', '') if isinstance(last, dict) else str(last)
    text = str(text).strip()
    text = re.sub(r'```(?:json)?', '', text).strip().rstrip('`').strip()
    depth, start = 0, None
    for i, ch in enumerate(text):
        if ch == '{':
            if depth == 0:
                start = i
            depth += 1
        elif ch == '}':
            depth -= 1
            if depth == 0 and start is not None:
                try:
                    return json.loads(text[start:i + 1])
                except ValueError:  # FIX: was a bare except; json.loads raises JSONDecodeError (a ValueError)
                    pass
                start = None
    return {'action_type': 'call_api', 'method': 'POST', 'url': '/mock_api/orders', 'body': {'qty': 2, 'product_id': 5}}


def generate_belief(completion: str, probe_response: str, history_response: str) -> dict:
    """Build the belief_state submitted to /finish_episode.

    Prefers a belief_state the model itself emitted (when it has the key
    field); otherwise reads ground-truth field names out of the probe
    response and synthesizes a belief. `history_response` is currently
    unused but kept for interface stability.
    """
    action = parse_action(completion)
    # Best case: model already output belief_state with correct fields
    if "belief_state" in action:
        b = action["belief_state"]
        if isinstance(b, dict) and "order_field" in b:
            return b
    # Parse probe — it contains ground truth field names
    try:
        probe = json.loads(probe_response) if isinstance(probe_response, str) else probe_response
    except Exception:
        probe = {}
    if not isinstance(probe, dict):  # FIX: a JSON list/scalar would crash probe.get() below
        probe = {}
    body_keys = list(action.get("body", {}).keys())
    url = action.get("url", "/mock_api/orders")
    return {
        "order_field": probe.get("order_field", body_keys[0] if body_keys else "qty"),
        "required_extra": probe.get("required_extra", None),
        "claims_id_field": probe.get("claims_id_field", None),
        "claims_amount_field": probe.get("claims_amount_field", None),
        "claims_endpoint": probe.get("claims_endpoint", None),
        "endpoint": url,
        "rooms_endpoint": probe.get("rooms_endpoint", None),
        "drift_detected": probe.get("order_field") != (body_keys[0] if body_keys else "qty"),
        "order_status": probe.get("order_status", None),
        "order_status_value": probe.get("order_status", None),
        "product_key": probe.get("product_key", "products"),
        "price_field": probe.get("product_price_field", "price"),
        "product_price_field": probe.get("product_price_field", "price"),
        "id_field": probe.get("product_id_field", "id"),
        "product_id_field": probe.get("product_id_field", "id"),
        "membership_tier": probe.get("membership_tier", None),
        "discount_rate": probe.get("discount_rate", None),
    }


def run_full_episode(completion: str, scenario: str, difficulty: str = 'easy') -> tuple:
    """Run one two-phase episode; returns (task_reward, belief_accuracy, shaped_reward).

    Any environment error yields (0.0, 0.0, 0.0) so training keeps going.
    `scenario` is accepted for interface stability (currently unused here).
    """
    task_action = parse_action(completion)
    try:
        start_data = env_start_episode(task_action, difficulty=difficulty)
    except Exception:
        return 0.0, 0.0, 0.0
    session_id = start_data['session_id']
    probe_response = start_data.get('probe_response', '')
    history_response = start_data.get('history_response', '')
    # Parse probe to fix task_action with correct field names
    try:
        probe = json.loads(probe_response) if isinstance(probe_response, str) else probe_response
    except Exception:
        probe = {}
    if not isinstance(probe, dict):  # FIX: non-dict probe crashed probe.get() below, killing training
        probe = {}
    # Correct the task action using probe evidence
    if probe.get("order_field") and "orders" in task_action.get("url", ""):
        correct_field = probe["order_field"]
        old_body = task_action.get("body", {})
        corrected_body = {}
        for k, v in old_body.items():
            if k in ("qty", "quantity", "amount", "count"):
                corrected_body[correct_field] = v
            else:
                corrected_body[k] = v
        task_action = {**task_action, "body": corrected_body}
    belief_state = generate_belief(completion, probe_response, history_response)
    try:
        result = env_finish_episode(session_id, task_action, belief_state)
    except Exception:
        return 0.0, 0.0, 0.0
    task_r = float(result.get('task_reward', 0.0))
    belief_r = float(result.get('belief_accuracy', 0.0))
    # Shaping: accurate beliefs amplify the task reward by up to 60%;
    # lucky task success with a bad belief is dampened.
    reward = task_r * (1.0 + 0.6 * belief_r)
    if task_r > 0.4 and belief_r < 0.2:
        reward *= 0.6
    return task_r, belief_r, float(np.clip(reward, -1.0, 1.0))


def reward_fn(completions, prompts, **kwargs):
    """GRPO reward function: one env episode per completion.

    Curriculum over TRAINING_STEP: easy → easy/medium mix → medium → hard.
    Appends per-completion stats to the module-level tracking lists.
    """
    global TRAINING_STEP
    TRAINING_STEP += 1
    if TRAINING_STEP < 30:
        difficulty = 'easy'
    elif TRAINING_STEP < 60:
        difficulty = 'easy' if TRAINING_STEP % 3 != 0 else 'medium'
    elif TRAINING_STEP < 80:
        difficulty = 'medium'
    else:
        difficulty = 'hard'
    rewards, step_tasks, step_beliefs = [], [], []
    for completion, prompt in zip(completions, prompts):
        # Pull the user scenario text back out of the chat template.
        scenario = ''
        try:
            m = re.search(r'<\|im_start\|>user\n(.*?)<\|im_end\|>', str(prompt), re.DOTALL)
            if m:
                scenario = m.group(1).strip()
        except Exception:
            pass
        task_r, belief_r, combined = run_full_episode(completion, scenario, difficulty)
        r = float(np.clip(combined, -1.0, 1.0))
        task_rewards_log.append(task_r)
        belief_accuracy_log.append(belief_r)
        combined_rewards_log.append(r)
        training_steps_log.append(TRAINING_STEP)
        step_tasks.append(task_r)
        step_beliefs.append(belief_r)
        rewards.append(r)
    if TRAINING_STEP % 10 == 0:
        print(f' [Step {TRAINING_STEP:3d} | {difficulty:6s}] '
              f'task={np.mean(step_tasks):.3f} | '
              f'belief={np.mean(step_beliefs):.3f} | '
              f'combined={np.mean(rewards):.3f}')
    return rewards


print('✓ Reward function ready')

# System prompt: forces a single JSON action with an explicit belief_state.
SYSTEM_PROMPT = """You are an agent completing professional API tasks.
The API world may mutate mid-episode WITHOUT warning.
You will NOT be told when drift occurs.
You MUST output a single JSON object with ALL of these fields:
{
  "action_type": "call_api",
  "method": "POST",
  "url": "/mock_api/orders",
  "body": {"qty": 2, "product_id": 5},
  "belief_state": {
    "drift_detected": false,
    "field_name": "qty",
    "endpoint": "/mock_api/orders",
    "what_changed": "nothing yet"
  }
}
Rules:
- belief_state is REQUIRED — missing it costs you points
- drift_detected must be true if you suspect any field/endpoint changed
- what_changed must describe what you think changed
- Output ONLY valid JSON. No markdown. No explanation."""

SCENARIO_PROMPTS = [
    "Place an order for product_id 5, quantity 2. POST /mock_api/orders. The quantity field name may change after your first call.",
    "Book a standard room for 2 nights. POST /mock_api/rooms/book. The booking endpoint may version-bump mid-task.",
    "Search flights BOM to DEL departing tomorrow. GET /mock_api/flights/search. Auth scheme may change from Bearer to ApiKey.",
    "File insurance claim for policy_id 1234, amount 5000. POST /mock_api/claims. Both endpoint and field names may change.",
    "Apply discount code SAVE20. POST /mock_api/discount/apply. A membership_tier policy requirement may appear.",
    "Book 1 conference room for 2 nights. POST /mock_api/rooms/book. Rate limits or endpoint may change.",
    "Search for electronics products. GET /mock_api/products. Response keys may rename (products to items, price to cost).",
]


def make_prompts(n=200):
    """Build n chat-templated prompts by cycling through SCENARIO_PROMPTS."""
    data = []
    for i in range(n):
        scenario = SCENARIO_PROMPTS[i % len(SCENARIO_PROMPTS)]
        data.append({
            "prompt": f"<|im_start|>system\n{SYSTEM_PROMPT}<|im_end|>\n<|im_start|>user\n{scenario}<|im_end|>\n<|im_start|>assistant\n"
        })
    return data


prompts_dataset = Dataset.from_list(make_prompts(200))
print(f"✓ Dataset: {len(prompts_dataset)} prompts")

# ----------------------------------------------------------------------------
# Model: Qwen2.5-3B in fp16 with LoRA adapters on attention + MLP projections.
# ----------------------------------------------------------------------------
print(f"Loading {MODEL_NAME}...")
MAX_SEQ_LEN = 1024
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=MAX_SEQ_LEN,
    dtype=torch.float16,
    load_in_4bit=False,
)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=42,
)
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"✓ Loaded at float16, LoRA applied")
print(f" Trainable: {trainable/1e6:.1f}M / {total/1e9:.2f}B ({100*trainable/total:.2f}%)")

training_args = GRPOConfig(
    temperature=0.8,
    learning_rate=5e-6,
    weight_decay=0.01,
    warmup_ratio=0.05,
    lr_scheduler_type="cosine",
    optim="adamw_8bit",
    max_grad_norm=0.3,
    logging_steps=5,
    output_dir="adaptive-world-grpo",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_generations=6,
    max_prompt_length=512,
    max_completion_length=200,
    max_steps=100,  # reduced: 3-phase is ~2x slower
    save_steps=50,
    fp16=True,
    dataloader_num_workers=0,
    remove_unused_columns=False,
)

trainer = GRPOTrainer(
    model=model,
    processing_class=tokenizer,
    reward_funcs=reward_fn,
    args=training_args,
    train_dataset=prompts_dataset,
)
print("✓ Trainer ready")
print(f" max_steps={training_args.max_steps} | num_generations={training_args.num_generations} | lr={training_args.learning_rate}")
# ============================================================================
# Smoke test: one hand-written "perfect" completion through the full
# two-phase episode pipeline, then a raw look at the two-phase endpoints,
# then the actual GRPO training run.
# ============================================================================
test = '{"action_type": "call_api", "method": "POST", "url": "/mock_api/orders", "body": {"qty": 2, "product_id": 5}, "belief_state": {"drift_detected": false, "field_name": "qty", "endpoint": "/mock_api/orders", "what_changed": "nothing yet"}}'
task_r, belief_r, combined = run_full_episode(test, scenario="Place an order for product_id 5, quantity 2.", difficulty="easy")
print(f"task={task_r} belief={belief_r} combined={combined}")

import httpx, json

# Phase 1 raw: start an episode and inspect the probe evidence directly.
test_action = {"action_type": "call_api", "method": "POST", "url": "/mock_api/orders", "body": {"qty": 2, "product_id": 5}}
with httpx.Client(base_url=ENV_URL, timeout=60) as c:
    r = c.post("/start_episode", json={
        "scenario_id": "auto",
        "difficulty": "easy",
        "task_action": test_action,
    })
    r.raise_for_status()  # FIX: was missing — an HTTP error surfaced later as a cryptic KeyError
    start_data = r.json()
print("session_id:", start_data.get("session_id"))
print("probe_response:", json.dumps(start_data.get("probe_response"), indent=2))
print("history_response:", json.dumps(start_data.get("history_response"), indent=2))

# Phase 2 raw: finish with correct fields and inspect the final scores.
belief = {
    "order_field": "qty",
    "drift_detected": False,
    "endpoint": "/mock_api/orders",
}
with httpx.Client(base_url=ENV_URL, timeout=60) as c:
    r = c.post("/finish_episode", json={
        "session_id": start_data["session_id"],
        "task_action": test_action,
        "belief_state": belief,
    })
    r.raise_for_status()  # FIX: was missing — fail loudly on 4xx/5xx
    result = r.json()
print("result:", json.dumps(result, indent=2))

# Reset tracking so the smoke test above doesn't pollute the training curves.
task_rewards_log = []
belief_accuracy_log = []
combined_rewards_log = []
training_steps_log = []
TRAINING_STEP = 0
print("✓ Tracking initialized")

print("Starting GRPO training...")
print("Live updates every 10 steps")
print("-" * 60)
trainer.train()
print("-" * 60)
print("✓ Training complete")
print(f" task (last 20): {np.mean(task_rewards_log[-20:]):.4f}")
print(f" belief (last 20): {np.mean(belief_accuracy_log[-20:]):.4f}")


def smooth(data, window=20):
    """Trailing moving average (window samples); short input passes through as an array."""
    if len(data) < window:
        return np.array(data)
    return np.convolve(data, np.ones(window) / window, mode="valid")


fig, axes = plt.subplots(1, 3, figsize=(18, 5))
# ============================================================================
# Training-curve plots, headline numbers, and the (commented) HF push stub.
# ============================================================================
fig.suptitle("AdaptiveWorld GRPO Training", fontsize=14, fontweight="bold")
steps = np.arange(len(task_rewards_log))

# Panel 1: raw + smoothed task reward vs. a fixed untrained baseline.
ax1 = axes[0]
ax1.plot(steps, task_rewards_log, alpha=0.2, color="steelblue")
if len(task_rewards_log) >= 20:
    sm_task = smooth(task_rewards_log)
    # FIX: smooth() uses mode="valid", so point i averages samples i..i+19.
    # Offset the x positions by window-1 so the smoothed curve lines up with
    # the raw series instead of appearing shifted 19 samples early.
    ax1.plot(np.arange(19, 19 + len(sm_task)), sm_task, color="steelblue", linewidth=2)
ax1.axhline(0.08, color="red", linestyle="--", alpha=0.6, label="baseline")
ax1.set_title("Task Completion Rate", fontweight="bold")
ax1.set_xlabel("Step"); ax1.set_ylabel("task_reward")
ax1.set_ylim(0, 1.0); ax1.legend(); ax1.grid(alpha=0.3)

# Panel 2: raw + smoothed belief accuracy.
ax2 = axes[1]
ax2.plot(steps, belief_accuracy_log, alpha=0.2, color="darkorange")
if len(belief_accuracy_log) >= 20:
    sm_belief = smooth(belief_accuracy_log)
    ax2.plot(np.arange(19, 19 + len(sm_belief)), sm_belief, color="darkorange", linewidth=2)  # same alignment fix
ax2.axhline(0.12, color="red", linestyle="--", alpha=0.6, label="baseline")
ax2.set_title("Belief Accuracy", fontweight="bold")
ax2.set_xlabel("Step"); ax2.set_ylabel("belief_accuracy")
ax2.set_ylim(0, 1.0); ax2.legend(); ax2.grid(alpha=0.3)

# Panel 3: early-vs-late scatter of task reward against belief accuracy,
# with Pearson r per half (expect correlation to strengthen late).
ax3 = axes[2]
mid = len(task_rewards_log) // 2
if mid > 0:
    ax3.scatter(task_rewards_log[:mid], belief_accuracy_log[:mid], alpha=0.4, color="tomato", s=20, label="early")
if mid < len(task_rewards_log):
    ax3.scatter(task_rewards_log[mid:], belief_accuracy_log[mid:], alpha=0.4, color="seagreen", s=20, label="late")
if mid > 1:
    r_e = np.corrcoef(task_rewards_log[:mid], belief_accuracy_log[:mid])[0, 1]
    r_l = np.corrcoef(task_rewards_log[mid:], belief_accuracy_log[mid:])[0, 1]
    ax3.text(0.05, 0.95, f"early r={r_e:.2f}", transform=ax3.transAxes, color="tomato", fontsize=10)
    ax3.text(0.05, 0.88, f"late r={r_l:.2f}", transform=ax3.transAxes, color="seagreen", fontsize=10)
ax3.set_title("Correlation: Task vs Belief", fontweight="bold")
ax3.set_xlabel("task_reward"); ax3.set_ylabel("belief_accuracy")
ax3.set_xlim(0, 1.0); ax3.set_ylim(0, 1.0); ax3.legend(); ax3.grid(alpha=0.3)

plt.tight_layout()
plt.savefig("training_curves.png", dpi=150, bbox_inches="tight")
plt.show()
print("✓ Saved: training_curves.png")

# Headline before/after numbers: first vs last quartile of the logs.
if task_rewards_log:
    n = len(task_rewards_log)
    q = max(1, n // 4)
    print("=" * 50)
    print("RESULTS")
    print("=" * 50)
    print(f" task before: {np.mean(task_rewards_log[:q]):.2%} after: {np.mean(task_rewards_log[-q:]):.2%}")
    print(f" belief before: {np.mean(belief_accuracy_log[:q]):.2%} after: {np.mean(belief_accuracy_log[-q:]):.2%}")
    if n > 4:
        mid = n // 2
        r_e = np.corrcoef(task_rewards_log[:mid], belief_accuracy_log[:mid])[0, 1]
        r_l = np.corrcoef(task_rewards_log[mid:], belief_accuracy_log[mid:])[0, 1]
        print(f" corr early: {r_e:.3f} late: {r_l:.3f}")
    print("=" * 50)


def print_summary():
    """Print the same before/after + correlation numbers in pitch-ready form."""
    if not task_rewards_log:
        print('No data. Run training first.')
        return
    n = len(task_rewards_log)
    q = max(1, n // 4)
    early_t = task_rewards_log[:q]
    late_t = task_rewards_log[-q:]
    early_b = belief_accuracy_log[:q]
    late_b = belief_accuracy_log[-q:]
    print('=' * 55)
    print('NUMBERS FOR Q&A / PITCH')
    print('=' * 55)
    print(f' Task — before: {np.mean(early_t):.2%} → after: {np.mean(late_t):.2%}')
    print(f' Belief — before: {np.mean(early_b):.2%} → after: {np.mean(late_b):.2%}')
    if n > 4:
        mid = n // 2
        r_e = np.corrcoef(task_rewards_log[:mid], belief_accuracy_log[:mid])[0, 1]
        r_l = np.corrcoef(task_rewards_log[mid:], belief_accuracy_log[mid:])[0, 1]
        print(f' Corr — early: {r_e:.3f} → late: {r_l:.3f}')
    print(f' Points: {n} | Per step: {training_args.num_generations}')  # ← fixed: training_args not config
    print('=' * 55)


print_summary()

print('=== DEMO: H1 — Silent Semantic Drift ===')
print('This is the scenario that makes judges stop.')
print('API returns 200 OK. No error. But status values changed.')
print()

HF_REPO = 'ProthamD/adaptive-world-grpo-qwen2.5-3b'
# Uncomment to push:
# from huggingface_hub import login
# login()
# model.save_pretrained_merged(HF_REPO, tokenizer, save_method="merged_16bit")
# print(f'✓ Pushed to https://huggingface.co/{HF_REPO}')
# FIX: don't use model.push_to_hub() directly on LoRA model —
# use save_pretrained_merged to merge weights first, then push.
# model.push_to_hub() on a PEFT model pushes only adapters, not merged weights.
# NOTE(review): save_pretrained_merged writes a LOCAL directory; to push the
# merged weights to the Hub in one step, unsloth's push_to_hub_merged is the
# counterpart — confirm against the installed unsloth version before pushing.
print(f'Will push to: {HF_REPO}')
print('Uncomment lines above. Uses save_pretrained_merged not push_to_hub.')