"""
|
|
DLM-NL2JSON-4B — Evaluation Script (Simplified)
|
|
|
|
Evaluates the model on the provided test set using an OpenAI-compatible API endpoint.
|
|
Measures per-category exact match accuracy and average latency.
|
|
|
|
Usage:
|
|
# Against vLLM / TensorRT-LLM served model
|
|
python eval_example.py \
|
|
--data test_data_lite_200.jsonl \
|
|
--base-url http://your-server:8006/v1 \
|
|
--model qwen3_4b_6th_norag \
|
|
--api-key token-abc123 \
|
|
--disable-thinking
|
|
|
|
# Against OpenAI API (GPT-4o baseline)
|
|
export OPENAI_API_KEY="sk-..."
|
|
python eval_example.py \
|
|
--data test_data_lite_200.jsonl \
|
|
--model gpt-4o
|
|
"""
|
|
|
|
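# Requires prompts.py alongside this script (it provides the SYS_*_DEFAULT system
# prompts) and the openai v1 client, whose openai.OpenAI(...) API is used below.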
import json, time, argparse, os
from collections import Counter, defaultdict
from typing import Dict, Any

# ── Prompts ──────────────────────────────────────────────
# Import from prompts.py (must be in the same directory)
from prompts import (
    SYS_CSM_DEFAULT,
    SYS_CREDIT_DEFAULT,
    SYS_GIS_DEFAULT,
    SYS_ALP_DEFAULT,
    SYS_CPI_DEFAULT,
)

# ── Category → (special_token, system_prompt) ────────────
TASK_MAP = {
    0:  ("<TASK_ALP>",    SYS_ALP_DEFAULT),     # ALP-A (pattern)
    1:  ("<TASK_ALP>",    SYS_ALP_DEFAULT),     # ALP-B (flow)
    2:  ("<TASK_CSM>",    SYS_CSM_DEFAULT),     # CSM (consumer spending)
    3:  ("<TASK_CREDIT>", SYS_CREDIT_DEFAULT),  # CREDIT-Income
    4:  ("<TASK_CREDIT>", SYS_CREDIT_DEFAULT),  # CREDIT-Spending
    5:  ("<TASK_CREDIT>", SYS_CREDIT_DEFAULT),  # CREDIT-Loan/Default
    6:  ("<TASK_CPI>",    SYS_CPI_DEFAULT),     # CPI (business status)
    9:  ("<TASK_GIS>",    SYS_GIS_DEFAULT),     # GIS-Inflow
    10: ("<TASK_GIS>",    SYS_GIS_DEFAULT),     # GIS-Outflow
    11: ("<TASK_GIS>",    SYS_GIS_DEFAULT),     # GIS-Consumption
}

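# The special task token is prepended to the user turn; see the messages
# built in main().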
CAT_NAMES = {
    0: "ALP-A(ptrn)", 1: "ALP-B(flow)", 2: "CSM",
    3: "CREDIT-Income", 4: "CREDIT-Spending", 5: "CREDIT-Loan",
    6: "CPI", 9: "GIS-Inflow", 10: "GIS-Outflow", 11: "GIS-Consumption",
}

# ── Required keys per category (for comparison) ─────────
REQUIRED_KEYS = {
    0:  ["base_ym", "region_nm", "ptrn", "sex_cd", "age_cd", "category"],
    1:  ["base_ym", "region_nm", "flow_cd", "sex_cd", "age_cd", "category"],
    2:  ["base_ym", "region_nm", "industry_select", "sex_cd", "age_cd", "category"],
    3:  ["base_ym", "region_nm", "job_cd", "perc_cd", "sex_cd", "age_cd", "category"],
    4:  ["base_ym", "region_nm", "job_cd", "perc_cd", "sex_cd", "age_cd", "category"],
    5:  ["base_ym", "region_nm", "job_cd", "perc_cd", "sex_cd", "age_cd", "category"],
    6:  ["base_ym", "region_nm", "bzc_cd", "cp_cd", "enp_cd", "category"],
    9:  ["region_nm", "base_ym", "region_count", "category"],
    10: ["region_nm", "base_ym", "region_count", "category"],
    11: ["region_nm", "base_ym", "industry_category", "category"],
}

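# Scoring is exact match over these keys only; extra keys in the prediction
# (e.g. the free-text "summary") are ignored by normalize() and compare() below.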
# ── Normalization helpers ────────────────────────────────
def norm_int_list(v):
    """Coerce a list of mixed-type codes into a sorted, de-duplicated int list."""
    if not isinstance(v, list):
        return v
    out = []
    for x in v:
        try:
            out.append(int(float(str(x).strip())))
        except Exception:
            continue
    return sorted(set(out))

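# Example: norm_int_list(["3", 1.0, "1", "n/a"]) -> [1, 3]
# (non-numeric entries are dropped, duplicates collapse, order is canonicalized)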
def norm_dict_of_lists(d):
    """Normalize industry_select or bzc_cd: {str_key: [int, ...]}."""
    if not isinstance(d, dict):
        return d
    return {str(k).upper() if len(str(k)) == 1 and str(k).isalpha() else str(k):
            norm_int_list(arr) if isinstance(arr, list) else arr
            for k, arr in d.items()}

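# Example: norm_dict_of_lists({"g": ["20", 10, 10], "misc": "x"})
#   -> {"G": [10, 20], "misc": "x"}
# (single-letter keys are uppercased; list values go through norm_int_list)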
def normalize(obj: Dict[str, Any], cat: int) -> Dict[str, Any]:
    """Normalize prediction/gold for fair comparison (summary excluded)."""
    o = dict(obj)
    o.pop("summary", None)

    # Numeric scalars that may arrive as strings
    for k in ["base_ym", "region_count", "category"]:
        if k in o and isinstance(o[k], str):
            try:
                o[k] = int(o[k])
            except ValueError:
                pass

    # Code lists: compared order- and type-insensitively
    for k in ["sex_cd", "age_cd", "job_cd", "perc_cd", "ptrn",
              "industry_category", "cp_cd", "enp_cd"]:
        if k in o:
            o[k] = norm_int_list(o[k])

    if "flow_cd" in o and isinstance(o["flow_cd"], list):
        o["flow_cd"] = norm_int_list(o["flow_cd"])

    for k in ["industry_select", "bzc_cd"]:
        if k in o:
            o[k] = norm_dict_of_lists(o[k])

    # Clamp region_count to the valid 1..10 range
    if "region_count" in o:
        try:
            o["region_count"] = max(1, min(10, int(o["region_count"])))
        except (ValueError, TypeError):
            pass

    return o

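# Example (category 2):
#   normalize({"base_ym": "202403", "sex_cd": ["2", "1", 1],
#              "summary": "...", "category": 2}, 2)
#   -> {"base_ym": 202403, "sex_cd": [1, 2], "category": 2}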
def extract_first_json(text: str):
    """Return the first balanced {...} span in text, or None."""
    start = text.find("{")
    if start == -1:
        return None
    depth = 0
    for i in range(start, len(text)):
        if text[i] == "{":
            depth += 1
        elif text[i] == "}":
            depth -= 1
            if depth == 0:
                return text[start:i + 1]
    return None

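# Example: extract_first_json('noise {"a": {"b": 1}} tail') -> '{"a": {"b": 1}}'
# Note: the brace counter does not skip braces inside JSON string values;
# a known simplification, acceptable for this script's single-object outputs.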
def compare(pred: Dict, gold: Dict, cat: int):
    """Exact match over the category's required keys; returns (ok, diff)."""
    req = REQUIRED_KEYS.get(cat, [])
    diff = {}
    for k in req:
        if pred.get(k, "<MISSING>") != gold.get(k, "<MISSING>"):
            diff[k] = {"pred": pred.get(k), "gold": gold.get(k)}
    return len(diff) == 0, diff

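# compare() returns (True, {}) on a full match, else (False, diff) where diff
# maps each mismatched key to {"pred": ..., "gold": ...} (shown as FAIL [keys]).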
# ── Main ─────────────────────────────────────────────────
def main():
    ap = argparse.ArgumentParser(description="DLM-NL2JSON-4B Evaluation")
    ap.add_argument("--data", required=True, help="Test JSONL file path")
    ap.add_argument("--base-url", default=None, help="OpenAI-compatible base URL")
    ap.add_argument("--model", required=True, help="Model name")
    ap.add_argument("--api-key", default=os.environ.get("OPENAI_API_KEY", ""), help="API key")
    ap.add_argument("--disable-thinking", action="store_true",
                    help="Pass chat_template_kwargs to disable Qwen3 thinking mode")
    ap.add_argument("--max-tokens", type=int, default=512)
    ap.add_argument("--per-cat", type=int, default=999, help="Max samples per category")
    args = ap.parse_args()

    import openai
    client = openai.OpenAI(
        base_url=args.base_url or None,
        api_key=args.api_key or "dummy",
        timeout=60.0,
    )

    # Load test data
    with open(args.data, encoding="utf-8") as f:
        raw = [json.loads(line) for line in f]

    # Group by category and sample
    by_cat = defaultdict(list)
    for item in raw:
        out = item["output"] if isinstance(item["output"], dict) else json.loads(item["output"])
        cat = out["category"]
        by_cat[cat].append({"input": item["input"], "gold": out})

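    # Each record is expected to look like (inferred from the fields read above):
    #   {"input": "<natural-language query>", "output": {"category": 2, ...}}
    # where "output" may be either a dict or a JSON-encoded string.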
    samples = []
    for cat in sorted(by_cat):
        items = by_cat[cat][:args.per_cat]
        samples.extend([(cat, ex) for ex in items])

    print(f"[INFO] Evaluating {len(samples)} samples across {len(by_cat)} categories\n")

    # Evaluate
    ok_counts, total_counts = Counter(), Counter()
    latency_sums = Counter()

    for idx, (cat, ex) in enumerate(samples, 1):
        user_in = ex["input"].strip()
        gold_norm = normalize(ex["gold"], cat)

        # Prepend the category's special task token to the user input
        tag, sys_prompt = TASK_MAP[cat]
        messages = [
            {"role": "system", "content": sys_prompt},
            {"role": "user", "content": f"{tag}\n{user_in}"},
        ]

        kwargs = dict(model=args.model, messages=messages,
                      max_tokens=args.max_tokens, temperature=0.0)
        if args.disable_thinking:
            # Only meaningful for servers that honor chat_template_kwargs (e.g. vLLM)
            kwargs["extra_body"] = {"chat_template_kwargs": {"enable_thinking": False}}

        t0 = time.perf_counter()
        try:
            resp = client.chat.completions.create(**kwargs)
            # content may be None; fall back to "" so parsing below fails gracefully
            gen = resp.choices[0].message.content or ""
        except Exception as e:
            dt = time.perf_counter() - t0
            total_counts[cat] += 1
            latency_sums[cat] += dt
            print(f"[{idx:04d}] {CAT_NAMES.get(cat, cat)} | ERROR: {e}")
            continue

        dt = time.perf_counter() - t0
        total_counts[cat] += 1
        latency_sums[cat] += dt

        json_str = extract_first_json(gen) or gen.strip()
        try:
            pred_obj = json.loads(json_str)
        except json.JSONDecodeError:
            print(f"[{idx:04d}] {CAT_NAMES.get(cat, cat)} | PARSE_FAIL | {dt:.2f}s")
            continue

        pred_norm = normalize(pred_obj, cat)
        ok, diff = compare(pred_norm, gold_norm, cat)
        if ok:
            ok_counts[cat] += 1

        status = "OK" if ok else f"FAIL {list(diff.keys())}"
        print(f"[{idx:04d}] {CAT_NAMES.get(cat, cat)} | {status} | {dt:.2f}s")

    # Summary
    print("\n" + "=" * 50)
    print("EVALUATION SUMMARY")
    print("=" * 50)
    total_ok = total_all = 0
    for c in sorted(total_counts):
        ok = ok_counts[c]
        tot = total_counts[c]
        acc = ok / tot if tot else 0
        avg_lat = latency_sums[c] / tot if tot else 0
        total_ok += ok
        total_all += tot
        print(f"  {CAT_NAMES.get(c, c):20s}: {ok:4d}/{tot:4d}  acc={acc:.1%}  avg={avg_lat:.3f}s")

    overall_acc = total_ok / total_all if total_all else 0
    overall_lat = sum(latency_sums.values()) / total_all if total_all else 0
    print(f"  {'OVERALL':20s}: {total_ok:4d}/{total_all:4d}  acc={overall_acc:.1%}  avg={overall_lat:.3f}s")

if __name__ == "__main__":
    main()