# ============================================================================
# Cell 0 — environment bootstrap (install unsloth, pin to one GPU).
# NOTE(review): this file is a notebook flattened to a script; the
# os.kill() below is an intentional kernel restart after pip install.
# ============================================================================
import os
import subprocess
import json
import re
import time

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Install unsloth (nightly from git) and its zoo of model patches.
subprocess.run('pip install -q "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"', shell=True)
subprocess.run('pip install -q --upgrade --no-cache-dir --no-deps unsloth_zoo', shell=True)

# Verify the install by echoing package name/version.
r = subprocess.run("pip show unsloth unsloth_zoo | grep -E 'Name|Version'", shell=True, capture_output=True, text=True)
print(r.stdout)
print("✓ Done — now run Cell 1 (imports)")

r = subprocess.run("pip show unsloth | grep Version", shell=True, capture_output=True, text=True)
print(r.stdout)
print(r.stderr)

os.environ["CUDA_VISIBLE_DEVICES"] = "0"
print("✓ Single GPU forced")

import unsloth  # must be imported before torch/trl so its patches apply
import torch
print(f"✓ unsloth OK")
print(f"✓ torch {torch.__version__} cuda={torch.cuda.is_available()}")

# NOTE(review): SIGKILL of the current process — in a notebook this restarts
# the kernel so the freshly-installed packages load cleanly. In a flat script
# it terminates execution here; everything below belongs to later cells.
os.kill(os.getpid(), 9)

# ============================================================================
# Cell 1 — imports and constants.
# ============================================================================
import numpy as np
import matplotlib.pyplot as plt
import httpx
import torch
from unsloth import FastLanguageModel
from trl import GRPOConfig, GRPOTrainer
from datasets import Dataset

# Remote AdaptiveWorld environment (HF Space) and the policy model to tune.
ENV_URL = "https://prothamd-prothamd-adaptive-world-env.hf.space"
MODEL_NAME = "Qwen/Qwen2.5-3B-Instruct"
print("✓ All imports OK")
print(f"✓ GPU: {torch.cuda.get_device_name(0)}")


def env_reset(scenario_id="auto", difficulty="easy"):
    """POST /reset — start a fresh scenario; returns the env's JSON state.

    NOTE(review): per env_run_episode's docstring, /reset + /step are
    stateless between calls and lose drift; kept for debugging only.
    """
    with httpx.Client(base_url=ENV_URL, timeout=30) as c:
        r = c.post("/reset", json={"scenario_id": scenario_id, "difficulty": difficulty})
        r.raise_for_status()
        return r.json()


def env_step(action):
    """POST /step — execute one action; returns the env's JSON response."""
    with httpx.Client(base_url=ENV_URL, timeout=30) as c:
        r = c.post("/step", json={"action": action})
        r.raise_for_status()
        return r.json()


def env_run_episode(actions: list, scenario_id="auto", difficulty="easy"):
    """Run a full episode in ONE env instance — drift accumulates correctly.

    This is the correct endpoint to use. /reset+/step are stateless and
    lose drift.
    """
    with httpx.Client(base_url=ENV_URL, timeout=60) as c:
        r = c.post("/run_episode", json={
            "scenario_id": scenario_id,
            "difficulty": difficulty,
            "actions": actions,
        })
        r.raise_for_status()
        return r.json()


def env_health():
    """GET /health — True when the Space answers; never raises."""
    try:
        with httpx.Client(base_url=ENV_URL, timeout=10) as c:
            data = c.get("/health").json()
        print(f"✓ Health: {data}")
        return True
    except Exception as e:
        print(f"✗ Health failed: {e}")
        return False


# Wait for the Space to wake up (cold starts take a while). After 5 failed
# attempts we fall through and let the first real call fail loudly.
for attempt in range(5):
    if env_health():
        break
    print(f" Attempt {attempt+1}/5 — waiting...")
    time.sleep(12)


def env_start_episode(task_action: dict, scenario_id="auto", difficulty="easy") -> dict:
    """Phase 1: run pre-drift + probe, get probe evidence. Returns session data."""
    with httpx.Client(base_url=ENV_URL, timeout=60) as c:
        r = c.post("/start_episode", json={
            "scenario_id": scenario_id,
            "difficulty": difficulty,
            "task_action": task_action,
        })
        r.raise_for_status()
        return r.json()


def env_finish_episode(session_id: str, task_action: dict, belief_state: dict) -> dict:
    """Phase 2: run corrected call + submit belief, get final scores."""
    with httpx.Client(base_url=ENV_URL, timeout=60) as c:
        r = c.post("/finish_episode", json={
            "session_id": session_id,
            "task_action": task_action,
            "belief_state": belief_state,
        })
        r.raise_for_status()
        return r.json()


# Global per-completion tracking, appended to by reward_fn during training.
task_rewards_log = []
belief_accuracy_log = []
combined_rewards_log = []
training_steps_log = []
TRAINING_STEP = 0
print("✓ Tracking initialized")


def parse_action(text):
    """Extract first JSON from model output (the task action).

    Accepts a raw string or a chat-message list; strips markdown fences,
    then scans for the first balanced {...} that parses as JSON. Falls
    back to a hard-coded default order action when nothing parses.
    """
    if isinstance(text, list):
        # FIX: guard against an empty completion list (text[-1] would raise).
        last = text[-1] if text else ''
        text = last.get('content', '') if isinstance(last, dict) else str(last)
    text = str(text).strip()
    text = re.sub(r'```(?:json)?', '', text).strip().rstrip('`').strip()
    depth, start = 0, None
    for i, ch in enumerate(text):
        if ch == '{':
            if depth == 0:
                start = i
            depth += 1
        elif ch == '}':
            depth -= 1
            if depth == 0 and start is not None:
                try:
                    return json.loads(text[start:i + 1])
                except ValueError:  # FIX: was a bare except; json.loads raises JSONDecodeError (a ValueError)
                    pass
                start = None
    return {'action_type': 'call_api', 'method': 'POST', 'url': '/mock_api/orders', 'body': {'qty': 2, 'product_id': 5}}


def generate_belief(completion: str, probe_response: str, history_response: str) -> dict:
    """Build the belief_state submitted to /finish_episode.

    Prefers a belief_state the model itself emitted (when it has the key
    field); otherwise reads ground-truth field names out of the probe
    response and synthesizes a belief. `history_response` is currently
    unused but kept for interface stability.
    """
    action = parse_action(completion)
    # Best case: model already output belief_state with correct fields
    if "belief_state" in action:
        b = action["belief_state"]
        if isinstance(b, dict) and "order_field" in b:
            return b
    # Parse probe — it contains ground truth field names
    try:
        probe = json.loads(probe_response) if isinstance(probe_response, str) else probe_response
    except Exception:
        probe = {}
    if not isinstance(probe, dict):  # FIX: a JSON list/scalar would crash probe.get() below
        probe = {}
    body_keys = list(action.get("body", {}).keys())
    url = action.get("url", "/mock_api/orders")
    return {
        "order_field": probe.get("order_field", body_keys[0] if body_keys else "qty"),
        "required_extra": probe.get("required_extra", None),
        "claims_id_field": probe.get("claims_id_field", None),
        "claims_amount_field": probe.get("claims_amount_field", None),
        "claims_endpoint": probe.get("claims_endpoint", None),
        "endpoint": url,
        "rooms_endpoint": probe.get("rooms_endpoint", None),
        "drift_detected": probe.get("order_field") != (body_keys[0] if body_keys else "qty"),
        "order_status": probe.get("order_status", None),
        "order_status_value": probe.get("order_status", None),
        "product_key": probe.get("product_key", "products"),
        "price_field": probe.get("product_price_field", "price"),
        "product_price_field": probe.get("product_price_field", "price"),
        "id_field": probe.get("product_id_field", "id"),
        "product_id_field": probe.get("product_id_field", "id"),
        "membership_tier": probe.get("membership_tier", None),
        "discount_rate": probe.get("discount_rate", None),
    }


def run_full_episode(completion: str, scenario: str, difficulty: str = 'easy') -> tuple:
    """Run one two-phase episode; returns (task_reward, belief_accuracy, shaped_reward).

    Any environment error yields (0.0, 0.0, 0.0) so training keeps going.
    `scenario` is accepted for interface stability (currently unused here).
    """
    task_action = parse_action(completion)
    try:
        start_data = env_start_episode(task_action, difficulty=difficulty)
    except Exception:
        return 0.0, 0.0, 0.0
    session_id = start_data['session_id']
    probe_response = start_data.get('probe_response', '')
    history_response = start_data.get('history_response', '')
    # Parse probe to fix task_action with correct field names
    try:
        probe = json.loads(probe_response) if isinstance(probe_response, str) else probe_response
    except Exception:
        probe = {}
    if not isinstance(probe, dict):  # FIX: non-dict probe crashed probe.get() below, killing training
        probe = {}
    # Correct the task action using probe evidence
    if probe.get("order_field") and "orders" in task_action.get("url", ""):
        correct_field = probe["order_field"]
        old_body = task_action.get("body", {})
        corrected_body = {}
        for k, v in old_body.items():
            if k in ("qty", "quantity", "amount", "count"):
                corrected_body[correct_field] = v
            else:
                corrected_body[k] = v
        task_action = {**task_action, "body": corrected_body}
    belief_state = generate_belief(completion, probe_response, history_response)
    try:
        result = env_finish_episode(session_id, task_action, belief_state)
    except Exception:
        return 0.0, 0.0, 0.0
    task_r = float(result.get('task_reward', 0.0))
    belief_r = float(result.get('belief_accuracy', 0.0))
    # Shaping: accurate beliefs amplify the task reward by up to 60%;
    # lucky task success with a bad belief is dampened.
    reward = task_r * (1.0 + 0.6 * belief_r)
    if task_r > 0.4 and belief_r < 0.2:
        reward *= 0.6
    return task_r, belief_r, float(np.clip(reward, -1.0, 1.0))


def reward_fn(completions, prompts, **kwargs):
    """GRPO reward function: one env episode per completion.

    Curriculum over TRAINING_STEP: easy → easy/medium mix → medium → hard.
    Appends per-completion stats to the module-level tracking lists.
    """
    global TRAINING_STEP
    TRAINING_STEP += 1
    if TRAINING_STEP < 30:
        difficulty = 'easy'
    elif TRAINING_STEP < 60:
        difficulty = 'easy' if TRAINING_STEP % 3 != 0 else 'medium'
    elif TRAINING_STEP < 80:
        difficulty = 'medium'
    else:
        difficulty = 'hard'
    rewards, step_tasks, step_beliefs = [], [], []
    for completion, prompt in zip(completions, prompts):
        # Pull the user scenario text back out of the chat template.
        scenario = ''
        try:
            m = re.search(r'<\|im_start\|>user\n(.*?)<\|im_end\|>', str(prompt), re.DOTALL)
            if m:
                scenario = m.group(1).strip()
        except Exception:
            pass
        task_r, belief_r, combined = run_full_episode(completion, scenario, difficulty)
        r = float(np.clip(combined, -1.0, 1.0))
        task_rewards_log.append(task_r)
        belief_accuracy_log.append(belief_r)
        combined_rewards_log.append(r)
        training_steps_log.append(TRAINING_STEP)
        step_tasks.append(task_r)
        step_beliefs.append(belief_r)
        rewards.append(r)
    if TRAINING_STEP % 10 == 0:
        print(f' [Step {TRAINING_STEP:3d} | {difficulty:6s}] '
              f'task={np.mean(step_tasks):.3f} | '
              f'belief={np.mean(step_beliefs):.3f} | '
              f'combined={np.mean(rewards):.3f}')
    return rewards


print('✓ Reward function ready')

# System prompt: forces a single JSON action with an explicit belief_state.
SYSTEM_PROMPT = """You are an agent completing professional API tasks.
The API world may mutate mid-episode WITHOUT warning.
You will NOT be told when drift occurs.
You MUST output a single JSON object with ALL of these fields:
{
  "action_type": "call_api",
  "method": "POST",
  "url": "/mock_api/orders",
  "body": {"qty": 2, "product_id": 5},
  "belief_state": {
    "drift_detected": false,
    "field_name": "qty",
    "endpoint": "/mock_api/orders",
    "what_changed": "nothing yet"
  }
}
Rules:
- belief_state is REQUIRED — missing it costs you points
- drift_detected must be true if you suspect any field/endpoint changed
- what_changed must describe what you think changed
- Output ONLY valid JSON. No markdown. No explanation."""

SCENARIO_PROMPTS = [
    "Place an order for product_id 5, quantity 2. POST /mock_api/orders. The quantity field name may change after your first call.",
    "Book a standard room for 2 nights. POST /mock_api/rooms/book. The booking endpoint may version-bump mid-task.",
    "Search flights BOM to DEL departing tomorrow. GET /mock_api/flights/search. Auth scheme may change from Bearer to ApiKey.",
    "File insurance claim for policy_id 1234, amount 5000. POST /mock_api/claims. Both endpoint and field names may change.",
    "Apply discount code SAVE20. POST /mock_api/discount/apply. A membership_tier policy requirement may appear.",
    "Book 1 conference room for 2 nights. POST /mock_api/rooms/book. Rate limits or endpoint may change.",
    "Search for electronics products. GET /mock_api/products. Response keys may rename (products to items, price to cost).",
]


def make_prompts(n=200):
    """Build n chat-templated prompts by cycling through SCENARIO_PROMPTS."""
    data = []
    for i in range(n):
        scenario = SCENARIO_PROMPTS[i % len(SCENARIO_PROMPTS)]
        data.append({
            "prompt": f"<|im_start|>system\n{SYSTEM_PROMPT}<|im_end|>\n<|im_start|>user\n{scenario}<|im_end|>\n<|im_start|>assistant\n"
        })
    return data


prompts_dataset = Dataset.from_list(make_prompts(200))
print(f"✓ Dataset: {len(prompts_dataset)} prompts")

# ----------------------------------------------------------------------------
# Model: Qwen2.5-3B in fp16 with LoRA adapters on attention + MLP projections.
# ----------------------------------------------------------------------------
print(f"Loading {MODEL_NAME}...")
MAX_SEQ_LEN = 1024
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=MAX_SEQ_LEN,
    dtype=torch.float16,
    load_in_4bit=False,
)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=42,
)
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"✓ Loaded at float16, LoRA applied")
print(f" Trainable: {trainable/1e6:.1f}M / {total/1e9:.2f}B ({100*trainable/total:.2f}%)")

training_args = GRPOConfig(
    temperature=0.8,
    learning_rate=5e-6,
    weight_decay=0.01,
    warmup_ratio=0.05,
    lr_scheduler_type="cosine",
    optim="adamw_8bit",
    max_grad_norm=0.3,
    logging_steps=5,
    output_dir="adaptive-world-grpo",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_generations=6,
    max_prompt_length=512,
    max_completion_length=200,
    max_steps=100,  # reduced: 3-phase is ~2x slower
    save_steps=50,
    fp16=True,
    dataloader_num_workers=0,
    remove_unused_columns=False,
)

trainer = GRPOTrainer(
    model=model,
    processing_class=tokenizer,
    reward_funcs=reward_fn,
    args=training_args,
    train_dataset=prompts_dataset,
)
print("✓ Trainer ready")
print(f" max_steps={training_args.max_steps} | num_generations={training_args.num_generations} | lr={training_args.learning_rate}")
# ============================================================================
# Smoke test: one hand-written "perfect" completion through the full
# two-phase episode pipeline, then a raw look at the two-phase endpoints,
# then the actual GRPO training run.
# ============================================================================
test = '{"action_type": "call_api", "method": "POST", "url": "/mock_api/orders", "body": {"qty": 2, "product_id": 5}, "belief_state": {"drift_detected": false, "field_name": "qty", "endpoint": "/mock_api/orders", "what_changed": "nothing yet"}}'
task_r, belief_r, combined = run_full_episode(test, scenario="Place an order for product_id 5, quantity 2.", difficulty="easy")
print(f"task={task_r} belief={belief_r} combined={combined}")

import httpx, json

# Phase 1 raw: start an episode and inspect the probe evidence directly.
test_action = {"action_type": "call_api", "method": "POST", "url": "/mock_api/orders", "body": {"qty": 2, "product_id": 5}}
with httpx.Client(base_url=ENV_URL, timeout=60) as c:
    r = c.post("/start_episode", json={
        "scenario_id": "auto",
        "difficulty": "easy",
        "task_action": test_action,
    })
    r.raise_for_status()  # FIX: was missing — an HTTP error surfaced later as a cryptic KeyError
    start_data = r.json()
print("session_id:", start_data.get("session_id"))
print("probe_response:", json.dumps(start_data.get("probe_response"), indent=2))
print("history_response:", json.dumps(start_data.get("history_response"), indent=2))

# Phase 2 raw: finish with correct fields and inspect the final scores.
belief = {
    "order_field": "qty",
    "drift_detected": False,
    "endpoint": "/mock_api/orders",
}
with httpx.Client(base_url=ENV_URL, timeout=60) as c:
    r = c.post("/finish_episode", json={
        "session_id": start_data["session_id"],
        "task_action": test_action,
        "belief_state": belief,
    })
    r.raise_for_status()  # FIX: was missing — fail loudly on 4xx/5xx
    result = r.json()
print("result:", json.dumps(result, indent=2))

# Reset tracking so the smoke test above doesn't pollute the training curves.
task_rewards_log = []
belief_accuracy_log = []
combined_rewards_log = []
training_steps_log = []
TRAINING_STEP = 0
print("✓ Tracking initialized")

print("Starting GRPO training...")
print("Live updates every 10 steps")
print("-" * 60)
trainer.train()
print("-" * 60)
print("✓ Training complete")
print(f" task (last 20): {np.mean(task_rewards_log[-20:]):.4f}")
print(f" belief (last 20): {np.mean(belief_accuracy_log[-20:]):.4f}")


def smooth(data, window=20):
    """Trailing moving average (window samples); short input passes through as an array."""
    if len(data) < window:
        return np.array(data)
    return np.convolve(data, np.ones(window) / window, mode="valid")


fig, axes = plt.subplots(1, 3, figsize=(18, 5))
# ============================================================================
# Training-curve plots, headline numbers, and the (commented) HF push stub.
# ============================================================================
fig.suptitle("AdaptiveWorld GRPO Training", fontsize=14, fontweight="bold")
steps = np.arange(len(task_rewards_log))

# Panel 1: raw + smoothed task reward vs. a fixed untrained baseline.
ax1 = axes[0]
ax1.plot(steps, task_rewards_log, alpha=0.2, color="steelblue")
if len(task_rewards_log) >= 20:
    sm_task = smooth(task_rewards_log)
    # FIX: smooth() uses mode="valid", so point i averages samples i..i+19.
    # Offset the x positions by window-1 so the smoothed curve lines up with
    # the raw series instead of appearing shifted 19 samples early.
    ax1.plot(np.arange(19, 19 + len(sm_task)), sm_task, color="steelblue", linewidth=2)
ax1.axhline(0.08, color="red", linestyle="--", alpha=0.6, label="baseline")
ax1.set_title("Task Completion Rate", fontweight="bold")
ax1.set_xlabel("Step"); ax1.set_ylabel("task_reward")
ax1.set_ylim(0, 1.0); ax1.legend(); ax1.grid(alpha=0.3)

# Panel 2: raw + smoothed belief accuracy.
ax2 = axes[1]
ax2.plot(steps, belief_accuracy_log, alpha=0.2, color="darkorange")
if len(belief_accuracy_log) >= 20:
    sm_belief = smooth(belief_accuracy_log)
    ax2.plot(np.arange(19, 19 + len(sm_belief)), sm_belief, color="darkorange", linewidth=2)  # same alignment fix
ax2.axhline(0.12, color="red", linestyle="--", alpha=0.6, label="baseline")
ax2.set_title("Belief Accuracy", fontweight="bold")
ax2.set_xlabel("Step"); ax2.set_ylabel("belief_accuracy")
ax2.set_ylim(0, 1.0); ax2.legend(); ax2.grid(alpha=0.3)

# Panel 3: early-vs-late scatter of task reward against belief accuracy,
# with Pearson r per half (expect correlation to strengthen late).
ax3 = axes[2]
mid = len(task_rewards_log) // 2
if mid > 0:
    ax3.scatter(task_rewards_log[:mid], belief_accuracy_log[:mid], alpha=0.4, color="tomato", s=20, label="early")
if mid < len(task_rewards_log):
    ax3.scatter(task_rewards_log[mid:], belief_accuracy_log[mid:], alpha=0.4, color="seagreen", s=20, label="late")
if mid > 1:
    r_e = np.corrcoef(task_rewards_log[:mid], belief_accuracy_log[:mid])[0, 1]
    r_l = np.corrcoef(task_rewards_log[mid:], belief_accuracy_log[mid:])[0, 1]
    ax3.text(0.05, 0.95, f"early r={r_e:.2f}", transform=ax3.transAxes, color="tomato", fontsize=10)
    ax3.text(0.05, 0.88, f"late r={r_l:.2f}", transform=ax3.transAxes, color="seagreen", fontsize=10)
ax3.set_title("Correlation: Task vs Belief", fontweight="bold")
ax3.set_xlabel("task_reward"); ax3.set_ylabel("belief_accuracy")
ax3.set_xlim(0, 1.0); ax3.set_ylim(0, 1.0); ax3.legend(); ax3.grid(alpha=0.3)

plt.tight_layout()
plt.savefig("training_curves.png", dpi=150, bbox_inches="tight")
plt.show()
print("✓ Saved: training_curves.png")

# Headline before/after numbers: first vs last quartile of the logs.
if task_rewards_log:
    n = len(task_rewards_log)
    q = max(1, n // 4)
    print("=" * 50)
    print("RESULTS")
    print("=" * 50)
    print(f" task before: {np.mean(task_rewards_log[:q]):.2%} after: {np.mean(task_rewards_log[-q:]):.2%}")
    print(f" belief before: {np.mean(belief_accuracy_log[:q]):.2%} after: {np.mean(belief_accuracy_log[-q:]):.2%}")
    if n > 4:
        mid = n // 2
        r_e = np.corrcoef(task_rewards_log[:mid], belief_accuracy_log[:mid])[0, 1]
        r_l = np.corrcoef(task_rewards_log[mid:], belief_accuracy_log[mid:])[0, 1]
        print(f" corr early: {r_e:.3f} late: {r_l:.3f}")
    print("=" * 50)


def print_summary():
    """Print the same before/after + correlation numbers in pitch-ready form."""
    if not task_rewards_log:
        print('No data. Run training first.')
        return
    n = len(task_rewards_log)
    q = max(1, n // 4)
    early_t = task_rewards_log[:q]
    late_t = task_rewards_log[-q:]
    early_b = belief_accuracy_log[:q]
    late_b = belief_accuracy_log[-q:]
    print('=' * 55)
    print('NUMBERS FOR Q&A / PITCH')
    print('=' * 55)
    print(f' Task — before: {np.mean(early_t):.2%} → after: {np.mean(late_t):.2%}')
    print(f' Belief — before: {np.mean(early_b):.2%} → after: {np.mean(late_b):.2%}')
    if n > 4:
        mid = n // 2
        r_e = np.corrcoef(task_rewards_log[:mid], belief_accuracy_log[:mid])[0, 1]
        r_l = np.corrcoef(task_rewards_log[mid:], belief_accuracy_log[mid:])[0, 1]
        print(f' Corr — early: {r_e:.3f} → late: {r_l:.3f}')
    print(f' Points: {n} | Per step: {training_args.num_generations}')  # ← fixed: training_args not config
    print('=' * 55)


print_summary()

print('=== DEMO: H1 — Silent Semantic Drift ===')
print('This is the scenario that makes judges stop.')
print('API returns 200 OK. No error. But status values changed.')
print()

HF_REPO = 'ProthamD/adaptive-world-grpo-qwen2.5-3b'
# Uncomment to push:
# from huggingface_hub import login
# login()
# model.save_pretrained_merged(HF_REPO, tokenizer, save_method="merged_16bit")
# print(f'✓ Pushed to https://huggingface.co/{HF_REPO}')
# FIX: don't use model.push_to_hub() directly on LoRA model —
# use save_pretrained_merged to merge weights first, then push.
# model.push_to_hub() on a PEFT model pushes only adapters, not merged weights.
# NOTE(review): save_pretrained_merged writes a LOCAL directory; to push the
# merged weights to the Hub in one step, unsloth's push_to_hub_merged is the
# counterpart — confirm against the installed unsloth version before pushing.
print(f'Will push to: {HF_REPO}')
print('Uncomment lines above. Uses save_pretrained_merged not push_to_hub.')