# template_bonus/llama3_bonus.py

# %%
# ----------------------------------------------------------
# Custom Hugging-Face pipeline for the “bonus” split that refers to the existing models
# Task id  :  quizbowl-bonus
# Expected input keys : leadin, part, previous_parts ('text' and 'guess')
# Must return        : answer, confidence, explanation
# ----------------------------------------------------------


import math

import json_repair
import torch
from datasets import Dataset
from loguru import logger
from torch.nn import functional as F
from tqdm.auto import tqdm
from transformers import Pipeline, pipeline
from transformers.models.llama.modeling_llama import LlamaForCausalLM
from transformers.pipelines import PIPELINE_REGISTRY


def format_part(number: int, text: str, guess: str) -> str:
    """Render one bonus part and the model's guess for it as two bullet lines.

    Args:
        number: Part number shown in the first bullet.
        text: Question text of that part.
        guess: The guess that was given for that part.

    Returns:
        Two tab-indented bullet lines joined by a newline.
    """
    part_line = f"\t * Part {number}: {text}"
    guess_line = f"\t * Model Guess: {guess}"
    return "\n".join((part_line, guess_line))


# System message placed first in every conversation built by prepare_conversation.
# It describes the JSON object the model is asked to reply with.
# NOTE(review): parse_output_text only reads "answer", "explanation" and
# "confidence"; the "justification" field requested here is never consumed.
# NOTE(review): the text mentions "responses to the previous related parts", but
# user_prompt_template has no placeholder for them — confirm whether previous
# parts were meant to be included in the prompt.
# NOTE(review): the prompt contains a typo ("Given the a leadin") and the schema
# sample lacks a comma after the "confidence" line; left untouched because
# editing the prompt changes what the model sees.
system_prompt = """
You are a quizbowl player. Given the a leadin and your responses to the previous related parts, provide the answer, a brief (1-2 sentences) explanation to the provided question along with your confidence in the guess.
The answer should be a single word or short phrase, and the explanation should be concise and relevant to the question.
The answer should be formatted in the below JSON format:

{
    "answer": str,
    "explanation": str,
    "confidence": float (0-1 in the steps of 0.01)
    "justification": str (optional justification for the confidence score)
}
The confidence should be a float between 0 and 1, representing your confidence in the answer.
"""

# User-turn template. prepare_conversation fills ``leadin``, ``part`` and
# ``image_note`` via ``str.format``; ``image_note`` is appended right after the
# question and is an empty string when no note is supplied.
# NOTE(review): the double quotes wrapping the Leadin/Question lines are part of
# the prompt text itself — confirm they are intentional.
user_prompt_template = """
"Leadin: {leadin}
Question: {part}"{image_note}
What is being asked in the question? Provide a concise answer, a brief explanation, and your confidence in the guess along with justification."""


def _bonus_image_note(leadin_images, part_images) -> str:
    li = leadin_images or []
    pi = part_images or []
    if not li and not pi:
        return ""
    return (
        f"\n\n[This bonus includes {len(li)} leadin image(s) and {len(pi)} part image(s); "
        "this text-only pipeline does not see pixels—use a VLM pipeline with "
        "`leadin_images` / `part_images`.]"
    )


def prepare_conversation(leadin, part, image_note: str = ""):
    """Build the two-message chat (system + user) for a single bonus part.

    Args:
        leadin: Leadin text substituted into the user prompt.
        part: Question text of the part being asked.
        image_note: Optional text appended after the question; defaults to
            an empty string.

    Returns:
        A list of two ``{"role", "content"}`` dicts: the system prompt first,
        then the formatted user prompt.
    """
    user_content = user_prompt_template.format(
        leadin=leadin,
        part=part,
        image_note=image_note,
    )
    system_message = {"role": "system", "content": system_prompt}
    user_message = {"role": "user", "content": user_content}
    return [system_message, user_message]


def parse_output_text(output_text: str):
    """Extract answer, explanation and confidence from raw model output.

    Everything from the first ``{`` onward is handed to ``json_repair`` so
    that slightly malformed or truncated JSON can still be read. Parsing is
    best-effort: any failure is logged and produces empty strings with a
    confidence of 0.0 instead of raising.

    Args:
        output_text: Decoded text generated by the model.

    Returns:
        A dict with keys ``"answer"`` (str), ``"explanation"`` (str) and
        ``"confidence"`` (float clamped to ``[0.0, 1.0]``).
    """

    def _as_text(value) -> str:
        # Non-string JSON values (e.g. ``"answer": 1776`` or ``null``) are
        # kept as text instead of failing on ``.strip()`` and discarding the
        # whole response.
        return "" if value is None else str(value).strip()

    try:
        start_index = output_text.find("{")
        if start_index == -1:
            raise ValueError("No JSON object found in the output text.")
        output_text = output_text[start_index:]
        json_data = json_repair.loads(output_text)
        if isinstance(json_data, list):
            json_data = json_data[0]
        if not isinstance(json_data, dict):
            raise TypeError(
                f"Expected a JSON object, got {type(json_data).__name__}."
            )
        answer = _as_text(json_data.get("answer"))
        explanation = _as_text(json_data.get("explanation"))
        confidence = json_data.get("confidence", 0.0)
    except Exception as e:
        # Deliberately broad: model output is untrusted free text and a bad
        # generation must never crash the pipeline.
        logger.warning(
            f"Error parsing JSON: {e.__class__.__name__} - {e}. Got:\n{output_text}"
        )
        answer, explanation, confidence = "", "", 0.0

    try:
        numeric_confidence = float(confidence)
        # NaN would slip through the min/max clamp below and come out as 1.0.
        if math.isnan(numeric_confidence):
            raise ValueError("confidence is NaN")
    except (TypeError, ValueError, OverflowError):
        # TypeError: null / list / dict; ValueError: non-numeric string or NaN;
        # OverflowError: integer too large for a float.
        logger.warning(f"Invalid confidence value: {confidence}. Defaulting to 0.0.")
        confidence = 0.0
    else:
        confidence = max(0.0, min(1.0, numeric_confidence))

    return {
        "answer": answer,
        "explanation": explanation,
        "confidence": confidence,
    }


def postprocess_response(output_text, scores=None):
    """Parse one generation and optionally blend in a logit-based confidence.

    Args:
        output_text: Decoded model output for a single example.
        scores: Optional sequence of score tensors. When given and non-empty,
            the maximum softmax probability of each element is averaged and
            that mean is itself averaged with the parsed confidence.

    Returns:
        The dict produced by ``parse_output_text``, with ``"confidence"``
        adjusted when ``scores`` is supplied.
    """
    response = parse_output_text(output_text)

    # Nothing to blend in: keep the confidence the model stated in its JSON.
    if scores is None or len(scores) == 0:
        return response

    # Highest softmax probability of each score tensor.
    step_maxima = []
    for step_scores in scores:
        step_maxima.append(F.softmax(step_scores, dim=-1).max().item())

    if step_maxima:
        logit_confidence = float(sum(step_maxima) / len(step_maxima))
    else:
        logit_confidence = 0.0

    # Equal-weight mean of the stated and the logit-derived confidence.
    response["confidence"] = (response["confidence"] + logit_confidence) / 2
    return response


class BonusPipeline(Pipeline):
    """Text-only pipeline that answers quizbowl bonus parts.

    ``preprocess`` turns leadin/part pairs into chat prompts, ``_forward``
    generates a continuation for each prompt, and ``postprocess`` parses every
    continuation into a dict with ``answer``, ``explanation`` and
    ``confidence``.
    """

    def __init__(self, model, tokenizer, **kwargs):
        """Initialise the base pipeline and configure the tokenizer for padding.

        Args:
            model: Causal language model used for generation.
            tokenizer: Tokenizer whose chat template formats the prompts.
            **kwargs: Forwarded unchanged to ``Pipeline.__init__``.
        """
        super().__init__(
            model=model,
            tokenizer=tokenizer,
            **kwargs,
        )
        # Left padding makes every prompt end at the same column, which is what
        # lets _forward strip the prompt with a single slice.
        self.tokenizer.padding_side = "left"
        # EOS is reused as the pad token so batched prompts can be padded.
        self.tokenizer.pad_token = self.tokenizer.eos_token

    def _sanitize_parameters(self, max_new_tokens=None, **kwargs):
        """Split call-time keyword arguments into per-stage parameter dicts.

        Args:
            max_new_tokens: Optional generation budget forwarded to
                ``_forward``; when omitted, ``_forward`` uses its own default.
            **kwargs: Any other keyword arguments are accepted and ignored.

        Returns:
            A ``(preprocess_params, forward_params, postprocess_params)`` tuple.
        """
        forward_params = {}
        if max_new_tokens is not None:
            forward_params["max_new_tokens"] = max_new_tokens
        return {}, forward_params, {}

    def preprocess(self, inputs):
        """Tokenise a batch of bonus parts into left-padded chat prompts.

        Args:
            inputs: Mapping with ``"leadin"`` and ``"part"`` and, optionally,
                ``"leadin_images"`` / ``"part_images"``. Values are normally
                parallel lists (one entry per example); a single example with
                string values is also accepted and treated as a batch of one.

        Returns:
            The tokenizer output of ``apply_chat_template`` for the whole batch.
        """
        leadins = inputs["leadin"]
        parts = inputs["part"]
        leadin_imgs = inputs.get("leadin_images")
        part_imgs = inputs.get("part_images")

        # A bare string means one un-batched example; without this, the string
        # would be iterated character by character.
        if isinstance(leadins, str):
            leadins, parts = [leadins], [parts]
            leadin_imgs, part_imgs = [leadin_imgs], [part_imgs]

        batch_size = len(leadins)
        leadin_imgs = leadin_imgs or [[] for _ in range(batch_size)]
        part_imgs = part_imgs or [[] for _ in range(batch_size)]

        # Indexed on purpose: a column shorter than ``leadin`` raises
        # IndexError instead of being silently truncated.
        conversations = [
            prepare_conversation(
                leadins[i],
                parts[i],
                image_note=_bonus_image_note(leadin_imgs[i], part_imgs[i]),
            )
            for i in range(batch_size)
        ]

        model_inputs = self.tokenizer.apply_chat_template(
            conversations,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            padding=True,
            return_tensors="pt",
        )
        return model_inputs

    def _forward(self, model_inputs, max_new_tokens=64):
        """Generate continuations and return only the newly generated tokens.

        Args:
            model_inputs: Tokenised prompts produced by ``preprocess``.
            max_new_tokens: Maximum number of tokens to generate per prompt.

        Returns:
            A dict ``{"sequences": tensor}`` holding the generated token ids
            with the prompt portion removed.
        """
        # Do not use output_scores=True: it materializes full-vocab logits each step and
        # routinely OOMs mid-size GPUs (e.g. T4). postprocess() only uses decoded text.
        with torch.no_grad():
            full = self.model.generate(
                **model_inputs,
                max_new_tokens=max_new_tokens,
                # Explicit so generate() pads finished sequences with the same
                # token the tokenizer used for the prompts.
                pad_token_id=self.tokenizer.pad_token_id,
            )
        input_length = model_inputs["input_ids"].shape[1]
        # A plain dict (rather than an ad-hoc object) is a container the base
        # Pipeline knows how to move between devices.
        return {"sequences": full[:, input_length:]}

    def postprocess(self, model_outputs):
        """Decode generated tokens and parse each text into a result record.

        Args:
            model_outputs: The dict returned by ``_forward``.

        Returns:
            A list with one ``{"answer", "explanation", "confidence"}`` dict
            per generated sequence.
        """
        output_texts = self.tokenizer.batch_decode(
            model_outputs["sequences"], skip_special_tokens=True
        )
        return [postprocess_response(output_text) for output_text in output_texts]


# Register the custom task at import time so that ``pipeline("quizbowl-bonus")``
# builds a BonusPipeline backed by a LlamaForCausalLM model.
# The ``default`` entry names the checkpoint used when the caller passes no model;
# presumably the tuple is (checkpoint id, revision) — confirm against the
# transformers registry documentation.
PIPELINE_REGISTRY.register_pipeline(
    "quizbowl-bonus",
    pipeline_class=BonusPipeline,
    pt_model=LlamaForCausalLM,
    default={
        "pt": ("meta-llama/Llama-3.2-3B-Instruct", "main"),
    },
    type="text",
)
# %%
# Demo / smoke test: builds the registered pipeline and runs it over a small
# synthetic dataset in batches.
if __name__ == "__main__":
    import os

    import torch
    from transformers import BitsAndBytesConfig

    # Default: no quantization config is passed, only ``device_map="auto"``.
    # Tight GPU (e.g. HF Space T4 with an 8B checkpoint): run
    # ``pip install bitsandbytes`` first, then set ``LLAMA3_BONUS_4BIT=1``
    # (``true`` / ``yes`` / ``on`` are accepted too, case-insensitively) to load
    # the model with the 4-bit NF4 configuration below.
    model_kwargs: dict = {"device_map": "auto"}
    if os.environ.get("LLAMA3_BONUS_4BIT", "").strip().lower() in ("1", "true", "yes", "on"):
        model_kwargs["quantization_config"] = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
        )

    # No model is named, so the default registered for "quizbowl-bonus" is used.
    pipe = pipeline("quizbowl-bonus", trust_remote_code=True, model_kwargs=model_kwargs)

    # Three sample bonuses repeated five times (15 rows in total).
    # NOTE(review): "previous_parts" appears in some examples, but
    # BonusPipeline.preprocess only reads "leadin", "part" and the optional
    # image keys.
    examples = [
        {
            "leadin": "This is a leadin.",
            "part": "What is the capital of France?",
        },
        {
            "leadin": "This is another leadin.",
            "part": "What is the largest planet in our solar system?",
            "previous_parts": [
                {"text": "What is the smallest planet?", "guess": "Mercury"},
                {"text": "What is the second smallest planet?", "guess": "Mars"},
            ],
        },
        {
            "leadin": "This is a leadin with no previous parts.",
            "part": "What is the chemical symbol for water?",
            "previous_parts": [],
        },
    ] * 5

    dataset = Dataset.from_list(examples)

    print("Dataset size:", len(dataset))
    # Results accumulate in ``outputs``; nothing further is printed here.
    outputs = []
    batch_size = 5
    # Each ``batch`` is passed to the pipeline as one input; presumably a dict
    # of column name -> list of values — confirm against ``Dataset.batch``.
    for batch in tqdm(dataset.batch(batch_size), desc="Processing batches"):
        output = pipe(batch, batch_size=batch_size)
        outputs.extend(output)