Initialize project; model provided by the ModelHub XC community
Model: ProthamD/adaptive-world-grpo-qwen2.5-3b Source: Original Platform
565
adaptive.ipynb
Normal file
@@ -0,0 +1,565 @@
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import subprocess

# Install unsloth
subprocess.run('pip install -q "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"', shell=True)
subprocess.run('pip install -q --upgrade --no-cache-dir --no-deps unsloth_zoo', shell=True)

# Verify
r = subprocess.run("pip show unsloth unsloth_zoo | grep -E 'Name|Version'", shell=True, capture_output=True, text=True)
print(r.stdout)
print("✓ Done — now run Cell 1 (imports)")

import subprocess
r = subprocess.run("pip show unsloth | grep Version", shell=True, capture_output=True, text=True)
print(r.stdout)
print(r.stderr)

import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
print("✓ Single GPU forced")

import unsloth
import torch
print("✓ unsloth OK")
print(f"✓ torch {torch.__version__} cuda={torch.cuda.is_available()}")

# Restart the kernel so the freshly installed packages are picked up
import os
os.kill(os.getpid(), 9)

import json, re, time
import numpy as np
import matplotlib.pyplot as plt
import httpx
import torch
from unsloth import FastLanguageModel
from trl import GRPOConfig, GRPOTrainer
from datasets import Dataset

ENV_URL = "https://prothamd-prothamd-adaptive-world-env.hf.space"
MODEL_NAME = "Qwen/Qwen2.5-3B-Instruct"

print("✓ All imports OK")
print(f"✓ GPU: {torch.cuda.get_device_name(0)}")

def env_reset(scenario_id="auto", difficulty="easy"):
    with httpx.Client(base_url=ENV_URL, timeout=30) as c:
        r = c.post("/reset", json={"scenario_id": scenario_id, "difficulty": difficulty})
        r.raise_for_status()
        return r.json()

def env_step(action):
    with httpx.Client(base_url=ENV_URL, timeout=30) as c:
        r = c.post("/step", json={"action": action})
        r.raise_for_status()
        return r.json()

def env_run_episode(actions: list, scenario_id="auto", difficulty="easy"):
    """
    Run a full episode in ONE env instance — drift accumulates correctly.
    This is the correct endpoint to use. /reset+/step are stateless and lose drift.
    """
    with httpx.Client(base_url=ENV_URL, timeout=60) as c:
        r = c.post("/run_episode", json={
            "scenario_id": scenario_id,
            "difficulty": difficulty,
            "actions": actions,
        })
        r.raise_for_status()
        return r.json()

def env_health():
    try:
        with httpx.Client(base_url=ENV_URL, timeout=10) as c:
            data = c.get("/health").json()
            print(f"✓ Health: {data}")
            return True
    except Exception as e:
        print(f"✗ Health failed: {e}")
        return False

# The Space may be cold-starting; poll until it responds
for attempt in range(5):
    if env_health():
        break
    print(f"  Attempt {attempt+1}/5 — waiting...")
    time.sleep(12)

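# A quick end-to-end sanity check (a minimal sketch): exercise /run_episode once,
# since it is defined above but not used elsewhere in this notebook. The action
# dicts reuse the task_action shape from the other endpoints (an assumption; the
# /run_episode schema isn't shown here). We just pretty-print a prefix of
# whatever comes back.
demo_actions = [
    {"action_type": "call_api", "method": "GET", "url": "/mock_api/products"},
    {"action_type": "call_api", "method": "POST", "url": "/mock_api/orders",
     "body": {"qty": 2, "product_id": 5}},
]
try:
    episode = env_run_episode(demo_actions, difficulty="easy")
    print(json.dumps(episode, indent=2)[:500])  # payload may be large
except Exception as e:
    print(f"✗ /run_episode check failed: {e}")
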
def env_start_episode(task_action: dict, scenario_id="auto", difficulty="easy") -> dict:
    """Phase 1: run pre-drift + probe, get probe evidence. Returns session data."""
    with httpx.Client(base_url=ENV_URL, timeout=60) as c:
        r = c.post("/start_episode", json={
            "scenario_id": scenario_id,
            "difficulty": difficulty,
            "task_action": task_action,
        })
        r.raise_for_status()
        return r.json()

def env_finish_episode(session_id: str, task_action: dict, belief_state: dict) -> dict:
    """Phase 2: run corrected call + submit belief, get final scores."""
    with httpx.Client(base_url=ENV_URL, timeout=60) as c:
        r = c.post("/finish_episode", json={
            "session_id": session_id,
            "task_action": task_action,
            "belief_state": belief_state,
        })
        r.raise_for_status()
        return r.json()

task_rewards_log = []
belief_accuracy_log = []
combined_rewards_log = []
training_steps_log = []
TRAINING_STEP = 0

print("✓ Tracking initialized")

def parse_action(text):
    """Extract first JSON from model output (the task action)."""
    if isinstance(text, list):
        text = text[-1].get('content', '') if isinstance(text[-1], dict) else str(text[-1])
    text = str(text).strip()
    text = re.sub(r'```(?:json)?', '', text).strip().rstrip('`').strip()
    # Scan for the first balanced {...} block and try to parse it
    depth, start = 0, None
    for i, ch in enumerate(text):
        if ch == '{':
            if depth == 0:
                start = i
            depth += 1
        elif ch == '}':
            depth -= 1
            if depth == 0 and start is not None:
                try:
                    return json.loads(text[start:i+1])
                except json.JSONDecodeError:
                    pass
                start = None
    # Fallback: a default action so the episode can still run
    return {'action_type': 'call_api', 'method': 'POST',
            'url': '/mock_api/orders', 'body': {'qty': 2, 'product_id': 5}}

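# Illustration of the parser on a typical fenced completion (the string below is
# a made-up example, not model output):
example_completion = '```json\n{"action_type": "call_api", "method": "POST", "url": "/mock_api/orders", "body": {"qty": 3, "product_id": 7}}\n```'
print(parse_action(example_completion))
# -> {'action_type': 'call_api', 'method': 'POST', 'url': '/mock_api/orders', 'body': {'qty': 3, 'product_id': 7}}
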
def generate_belief(completion: str, probe_response: str, history_response: str) -> dict:
    action = parse_action(completion)

    # Best case: model already output belief_state with correct fields
    if "belief_state" in action:
        b = action["belief_state"]
        if "order_field" in b:
            return b

    # Parse probe — it contains ground truth field names
    try:
        probe = json.loads(probe_response) if isinstance(probe_response, str) else probe_response
    except Exception:
        probe = {}
    if not isinstance(probe, dict):
        probe = {}

    body_keys = list(action.get("body", {}).keys())
    url = action.get("url", "/mock_api/orders")

    return {
        "order_field": probe.get("order_field", body_keys[0] if body_keys else "qty"),
        "required_extra": probe.get("required_extra", None),
        "claims_id_field": probe.get("claims_id_field", None),
        "claims_amount_field": probe.get("claims_amount_field", None),
        "claims_endpoint": probe.get("claims_endpoint", None),
        "endpoint": url,
        "rooms_endpoint": probe.get("rooms_endpoint", None),
        # NOTE: if the probe lacks order_field this evaluates to True (None != "qty")
        "drift_detected": probe.get("order_field") != (body_keys[0] if body_keys else "qty"),
        "order_status": probe.get("order_status", None),
        "order_status_value": probe.get("order_status", None),
        "product_key": probe.get("product_key", "products"),
        "price_field": probe.get("product_price_field", "price"),
        "product_price_field": probe.get("product_price_field", "price"),
        "id_field": probe.get("product_id_field", "id"),
        "product_id_field": probe.get("product_id_field", "id"),
        "membership_tier": probe.get("membership_tier", None),
        "discount_rate": probe.get("discount_rate", None),
    }

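# Tiny worked example with invented values: the probe says the order field
# drifted to "order_qty", while the model's body still used "qty", so the
# fallback belief flags drift.
_b = generate_belief(
    '{"action_type": "call_api", "method": "POST", "url": "/mock_api/orders", "body": {"qty": 2, "product_id": 5}}',
    probe_response='{"order_field": "order_qty"}',
    history_response='',
)
print(_b["order_field"], _b["drift_detected"])  # -> order_qty True
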
def run_full_episode(completion: str, scenario: str, difficulty: str = 'easy') -> tuple:
    # NOTE: `scenario` is currently unused — the env picks via scenario_id="auto"
    task_action = parse_action(completion)

    try:
        start_data = env_start_episode(task_action, difficulty=difficulty)
    except Exception:
        return 0.0, 0.0, 0.0

    session_id = start_data['session_id']
    probe_response = start_data.get('probe_response', '')
    history_response = start_data.get('history_response', '')

    # Parse probe to fix task_action with correct field names
    try:
        probe = json.loads(probe_response) if isinstance(probe_response, str) else probe_response
    except Exception:
        probe = {}
    if not isinstance(probe, dict):
        probe = {}

    # Correct the task action using probe evidence
    if probe.get("order_field") and "orders" in task_action.get("url", ""):
        correct_field = probe["order_field"]
        old_body = task_action.get("body", {})
        corrected_body = {}
        for k, v in old_body.items():
            if k in ("qty", "quantity", "amount", "count"):
                corrected_body[correct_field] = v
            else:
                corrected_body[k] = v
        task_action = {**task_action, "body": corrected_body}

    belief_state = generate_belief(completion, probe_response, history_response)

    try:
        result = env_finish_episode(session_id, task_action, belief_state)
    except Exception:
        return 0.0, 0.0, 0.0

    task_r = float(result.get('task_reward', 0.0))
    belief_r = float(result.get('belief_accuracy', 0.0))
    # Shaped reward: belief quality scales task reward; "lucky" wins get penalized
    reward = task_r * (1.0 + 0.6 * belief_r)
    if task_r > 0.4 and belief_r < 0.2:
        reward *= 0.6
    return task_r, belief_r, float(np.clip(reward, -1.0, 1.0))

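# How the shaping behaves (pure arithmetic, no env needed): an accurate belief
# scales task reward up by as much as 60%, while a high task score with a blind
# belief gets the 0.6 "lucky win" penalty.
for t, b in [(0.8, 0.9), (0.8, 0.1), (0.3, 0.9)]:
    r = t * (1.0 + 0.6 * b)
    if t > 0.4 and b < 0.2:
        r *= 0.6
    print(f"task={t} belief={b} -> reward={min(r, 1.0):.3f}")
# task=0.8 belief=0.9 -> 1.000 (1.232 before clipping)
# task=0.8 belief=0.1 -> 0.509 (penalized)
# task=0.3 belief=0.9 -> 0.462
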
def reward_fn(completions, prompts, **kwargs):
    global TRAINING_STEP
    TRAINING_STEP += 1

    # Curriculum: easy -> mixed -> medium -> hard as training progresses
    if TRAINING_STEP < 30:
        difficulty = 'easy'
    elif TRAINING_STEP < 60:
        difficulty = 'easy' if TRAINING_STEP % 3 != 0 else 'medium'
    elif TRAINING_STEP < 80:
        difficulty = 'medium'
    else:
        difficulty = 'hard'

    rewards, step_tasks, step_beliefs = [], [], []

    for completion, prompt in zip(completions, prompts):
        scenario = ''
        try:
            m = re.search(r'<\|im_start\|>user\n(.*?)<\|im_end\|>', str(prompt), re.DOTALL)
            if m:
                scenario = m.group(1).strip()
        except Exception:
            pass

        task_r, belief_r, combined = run_full_episode(completion, scenario, difficulty)
        r = float(np.clip(combined, -1.0, 1.0))
        task_rewards_log.append(task_r)
        belief_accuracy_log.append(belief_r)
        combined_rewards_log.append(r)
        training_steps_log.append(TRAINING_STEP)
        step_tasks.append(task_r)
        step_beliefs.append(belief_r)
        rewards.append(r)

    if TRAINING_STEP % 10 == 0:
        print(f'  [Step {TRAINING_STEP:3d} | {difficulty:6s}] '
              f'task={np.mean(step_tasks):.3f} | '
              f'belief={np.mean(step_beliefs):.3f} | '
              f'combined={np.mean(rewards):.3f}')

    return rewards

print('✓ Reward function ready')

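# The curriculum in isolation (a throwaway sketch mirroring the branches above),
# to show which difficulty a given reward-fn invocation would see:
def _difficulty_at(step):
    if step < 30:
        return 'easy'
    if step < 60:
        return 'easy' if step % 3 != 0 else 'medium'
    if step < 80:
        return 'medium'
    return 'hard'

for s in (1, 29, 30, 31, 60, 79, 80, 100):
    print(s, _difficulty_at(s))
# steps 30-59 are mostly easy, with every third step bumped to medium
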
SYSTEM_PROMPT = """You are an agent completing professional API tasks.
The API world may mutate mid-episode WITHOUT warning. You will NOT be told when drift occurs.

You MUST output a single JSON object with ALL of these fields:
{
  "action_type": "call_api",
  "method": "POST",
  "url": "/mock_api/orders",
  "body": {"qty": 2, "product_id": 5},
  "belief_state": {
    "drift_detected": false,
    "field_name": "qty",
    "endpoint": "/mock_api/orders",
    "what_changed": "nothing yet"
  }
}

Rules:
- belief_state is REQUIRED — missing it costs you points
- drift_detected must be true if you suspect any field/endpoint changed
- what_changed must describe what you think changed
- Output ONLY valid JSON. No markdown. No explanation."""

SCENARIO_PROMPTS = [
    "Place an order for product_id 5, quantity 2. POST /mock_api/orders. The quantity field name may change after your first call.",
    "Book a standard room for 2 nights. POST /mock_api/rooms/book. The booking endpoint may version-bump mid-task.",
    "Search flights BOM to DEL departing tomorrow. GET /mock_api/flights/search. Auth scheme may change from Bearer to ApiKey.",
    "File insurance claim for policy_id 1234, amount 5000. POST /mock_api/claims. Both endpoint and field names may change.",
    "Apply discount code SAVE20. POST /mock_api/discount/apply. A membership_tier policy requirement may appear.",
    "Book 1 conference room for 2 nights. POST /mock_api/rooms/book. Rate limits or endpoint may change.",
    "Search for electronics products. GET /mock_api/products. Response keys may rename (products to items, price to cost).",
]

def make_prompts(n=200):
    data = []
    for i in range(n):
        scenario = SCENARIO_PROMPTS[i % len(SCENARIO_PROMPTS)]
        data.append({
            "prompt": f"<|im_start|>system\n{SYSTEM_PROMPT}<|im_end|>\n<|im_start|>user\n{scenario}<|im_end|>\n<|im_start|>assistant\n"
        })
    return data

prompts_dataset = Dataset.from_list(make_prompts(200))
print(f"✓ Dataset: {len(prompts_dataset)} prompts")

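# Sanity-check the hand-rolled ChatML template by printing the tail of one
# rendered prompt (the system prompt is long, so only the end is shown):
print(prompts_dataset[0]["prompt"][-300:])
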
print(f"Loading {MODEL_NAME}...")
MAX_SEQ_LEN = 1024

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=MAX_SEQ_LEN,
    dtype=torch.float16,
    load_in_4bit=False,
)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=42,
)

trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print("✓ Loaded at float16, LoRA applied")
print(f"  Trainable: {trainable/1e6:.1f}M / {total/1e9:.2f}B ({100*trainable/total:.2f}%)")

training_args = GRPOConfig(
    temperature=0.8,
    learning_rate=5e-6,
    weight_decay=0.01,
    warmup_ratio=0.05,
    lr_scheduler_type="cosine",
    optim="adamw_8bit",
    max_grad_norm=0.3,
    logging_steps=5,
    output_dir="adaptive-world-grpo",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_generations=6,
    max_prompt_length=512,
    max_completion_length=200,
    max_steps=100,  # reduced: the 3-phase episode protocol is roughly 2x slower
    save_steps=50,
    fp16=True,
    dataloader_num_workers=0,
    remove_unused_columns=False,
)

trainer = GRPOTrainer(
    model=model,
    processing_class=tokenizer,
    reward_funcs=reward_fn,
    args=training_args,
    train_dataset=prompts_dataset,
)

print("✓ Trainer ready")
print(f"  max_steps={training_args.max_steps} | num_generations={training_args.num_generations} | lr={training_args.learning_rate}")

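# Back-of-envelope cost per optimizer step (arithmetic only; exact batching is
# TRL-version dependent): every sampled completion triggers a full two-phase
# episode, i.e. two HTTP calls, so the remote env dominates wall-clock time.
gens_per_opt_step = (training_args.per_device_train_batch_size
                     * training_args.gradient_accumulation_steps
                     * training_args.num_generations)
print(f"~{gens_per_opt_step} completions / ~{2 * gens_per_opt_step} env calls per optimizer step")
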
test = '{"action_type": "call_api", "method": "POST", "url": "/mock_api/orders", "body": {"qty": 2, "product_id": 5}, "belief_state": {"drift_detected": false, "field_name": "qty", "endpoint": "/mock_api/orders", "what_changed": "nothing yet"}}'
task_r, belief_r, combined = run_full_episode(test, scenario="Place an order for product_id 5, quantity 2.", difficulty="easy")
print(f"task={task_r} belief={belief_r} combined={combined}")

import httpx, json

test_action = {"action_type": "call_api", "method": "POST",
               "url": "/mock_api/orders", "body": {"qty": 2, "product_id": 5}}

with httpx.Client(base_url=ENV_URL, timeout=60) as c:
    r = c.post("/start_episode", json={
        "scenario_id": "auto",
        "difficulty": "easy",
        "task_action": test_action,
    })
    start_data = r.json()

print("session_id:", start_data.get("session_id"))
print("probe_response:", json.dumps(start_data.get("probe_response"), indent=2))
print("history_response:", json.dumps(start_data.get("history_response"), indent=2))

# Now finish with correct fields
belief = {
    "order_field": "qty",
    "drift_detected": False,
    "endpoint": "/mock_api/orders",
}
with httpx.Client(base_url=ENV_URL, timeout=60) as c:
    r = c.post("/finish_episode", json={
        "session_id": start_data["session_id"],
        "task_action": test_action,
        "belief_state": belief,
    })
    result = r.json()

print("result:", json.dumps(result, indent=2))

# Reset tracking right before training so the logs contain only training-time
# episodes (the smoke tests above also appended to them)
task_rewards_log = []
belief_accuracy_log = []
combined_rewards_log = []
training_steps_log = []
TRAINING_STEP = 0

print("✓ Tracking initialized")

print("Starting GRPO training...")
print("Live updates every 10 steps")
print("-" * 60)

trainer.train()

print("-" * 60)
print("✓ Training complete")
print(f"  task (last 20): {np.mean(task_rewards_log[-20:]):.4f}")
print(f"  belief (last 20): {np.mean(belief_accuracy_log[-20:]):.4f}")

def smooth(data, window=20):
    if len(data) < window:
        return np.array(data)
    return np.convolve(data, np.ones(window)/window, mode="valid")

fig, axes = plt.subplots(1, 3, figsize=(18, 5))
fig.suptitle("AdaptiveWorld GRPO Training", fontsize=14, fontweight="bold")

steps = np.arange(len(task_rewards_log))

ax1 = axes[0]
ax1.plot(steps, task_rewards_log, alpha=0.2, color="steelblue")
if len(task_rewards_log) >= 20:
    ax1.plot(np.arange(len(smooth(task_rewards_log))), smooth(task_rewards_log), color="steelblue", linewidth=2)
ax1.axhline(0.08, color="red", linestyle="--", alpha=0.6, label="baseline")
ax1.set_title("Task Completion Rate", fontweight="bold")
ax1.set_xlabel("Step"); ax1.set_ylabel("task_reward")
ax1.set_ylim(0, 1.0); ax1.legend(); ax1.grid(alpha=0.3)

ax2 = axes[1]
ax2.plot(steps, belief_accuracy_log, alpha=0.2, color="darkorange")
if len(belief_accuracy_log) >= 20:
    ax2.plot(np.arange(len(smooth(belief_accuracy_log))), smooth(belief_accuracy_log), color="darkorange", linewidth=2)
ax2.axhline(0.12, color="red", linestyle="--", alpha=0.6, label="baseline")
ax2.set_title("Belief Accuracy", fontweight="bold")
ax2.set_xlabel("Step"); ax2.set_ylabel("belief_accuracy")
ax2.set_ylim(0, 1.0); ax2.legend(); ax2.grid(alpha=0.3)

ax3 = axes[2]
mid = len(task_rewards_log) // 2
if mid > 0:
    ax3.scatter(task_rewards_log[:mid], belief_accuracy_log[:mid], alpha=0.4, color="tomato", s=20, label="early")
if mid < len(task_rewards_log):
    ax3.scatter(task_rewards_log[mid:], belief_accuracy_log[mid:], alpha=0.4, color="seagreen", s=20, label="late")
if mid > 1:
    r_e = np.corrcoef(task_rewards_log[:mid], belief_accuracy_log[:mid])[0, 1]
    r_l = np.corrcoef(task_rewards_log[mid:], belief_accuracy_log[mid:])[0, 1]
    ax3.text(0.05, 0.95, f"early r={r_e:.2f}", transform=ax3.transAxes, color="tomato", fontsize=10)
    ax3.text(0.05, 0.88, f"late r={r_l:.2f}", transform=ax3.transAxes, color="seagreen", fontsize=10)
ax3.set_title("Correlation: Task vs Belief", fontweight="bold")
ax3.set_xlabel("task_reward"); ax3.set_ylabel("belief_accuracy")
ax3.set_xlim(0, 1.0); ax3.set_ylim(0, 1.0); ax3.legend(); ax3.grid(alpha=0.3)

plt.tight_layout()
plt.savefig("training_curves.png", dpi=150, bbox_inches="tight")
plt.show()
print("✓ Saved: training_curves.png")

if task_rewards_log:
    n = len(task_rewards_log)
    q = max(1, n // 4)
    print("=" * 50)
    print("RESULTS")
    print("=" * 50)
    print(f"  task   before: {np.mean(task_rewards_log[:q]):.2%}  after: {np.mean(task_rewards_log[-q:]):.2%}")
    print(f"  belief before: {np.mean(belief_accuracy_log[:q]):.2%}  after: {np.mean(belief_accuracy_log[-q:]):.2%}")
    if n > 4:
        mid = n // 2
        r_e = np.corrcoef(task_rewards_log[:mid], belief_accuracy_log[:mid])[0, 1]
        r_l = np.corrcoef(task_rewards_log[mid:], belief_accuracy_log[mid:])[0, 1]
        print(f"  corr early: {r_e:.3f}  late: {r_l:.3f}")
    print("=" * 50)

def print_summary():
    if not task_rewards_log:
        print('No data. Run training first.')
        return
    n = len(task_rewards_log)
    q = max(1, n // 4)
    early_t = task_rewards_log[:q]
    late_t = task_rewards_log[-q:]
    early_b = belief_accuracy_log[:q]
    late_b = belief_accuracy_log[-q:]

    print('=' * 55)
    print('NUMBERS FOR Q&A / PITCH')
    print('=' * 55)
    print(f'  Task   — before: {np.mean(early_t):.2%} → after: {np.mean(late_t):.2%}')
    print(f'  Belief — before: {np.mean(early_b):.2%} → after: {np.mean(late_b):.2%}')
    if n > 4:
        mid = n // 2
        r_e = np.corrcoef(task_rewards_log[:mid], belief_accuracy_log[:mid])[0, 1]
        r_l = np.corrcoef(task_rewards_log[mid:], belief_accuracy_log[mid:])[0, 1]
        print(f'  Corr   — early: {r_e:.3f} → late: {r_l:.3f}')
    print(f'  Points: {n} | Per step: {training_args.num_generations}')
    print('=' * 55)

print_summary()

print('=== DEMO: H1 — Silent Semantic Drift ===')
print('This is the scenario that makes judges stop.')
print('API returns 200 OK. No error. But status values changed.')
print()

HF_REPO = 'ProthamD/adaptive-world-grpo-qwen2.5-3b'

# Uncomment to push:
# from huggingface_hub import login
# login()
# model.push_to_hub_merged(HF_REPO, tokenizer, save_method="merged_16bit")
# print(f'✓ Pushed to https://huggingface.co/{HF_REPO}')

# FIX: don't use model.push_to_hub() directly on the LoRA model —
# on a PEFT model it pushes only the adapters, not merged weights.
# push_to_hub_merged merges the LoRA into the base weights first, then pushes.
print(f'Will push to: {HF_REPO}')
print('Uncomment the lines above. Uses push_to_hub_merged, not plain push_to_hub.')
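
# Lightweight alternative (a sketch, not what this notebook shipped): push only
# the LoRA adapters, which is fast and lets users re-attach them to the base
# Qwen2.5-3B-Instruct via PEFT. The "-lora" repo suffix is made up here.
# model.push_to_hub(f"{HF_REPO}-lora")
# tokenizer.push_to_hub(f"{HF_REPO}-lora")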