# %%
# ----------------------------------------------------------
# Custom Hugging-Face pipeline for the “bonus” split that refers to the existing models
# Task id : quizbowl-bonus
# Expected input keys : leadin, part, previous_parts ('text' and 'guess')
# Must return : answer, confidence, explanation
# ----------------------------------------------------------
import math

import json_repair
import torch
from datasets import Dataset
from loguru import logger
from torch.nn import functional as F
from tqdm.auto import tqdm
from transformers import Pipeline, pipeline
from transformers.models.llama.modeling_llama import LlamaForCausalLM
from transformers.pipelines import PIPELINE_REGISTRY

# Generation budget used when the caller does not pass ``max_new_tokens``.
DEFAULT_MAX_NEW_TOKENS = 64


def format_part(number: int, text: str, guess: str) -> str:
    """Render one previous bonus part and the model's guess as two bullet lines.

    NOTE(review): nothing in this module calls this helper at the moment;
    ``previous_parts`` is not injected into the prompt by ``preprocess``.
    """
    return f"\t * Part {number}: {text}\n\t * Model Guess: {guess}"


system_prompt = """
You are a quizbowl player. Given the a leadin and your responses to the previous related parts, provide the answer, a brief (1-2 sentences) explanation to the provided question along with your confidence in the guess.
The answer should be a single word or short phrase, and the explanation should be concise and relevant to the question.
The answer should be formatted in the below JSON format:

{
    "answer": str,
    "explanation": str,
    "confidence": float (0-1 in the steps of 0.01)
    "justification": str (optional justification for the confidence score)
}
The confidence should be a float between 0 and 1, representing your confidence in the answer.
"""

user_prompt_template = """
"Leadin: {leadin}
Question: {part}"{image_note}
What is being asked in the question?
Provide a concise answer, a brief explanation, and your confidence in the guess along with justification."""


def _bonus_image_note(leadin_images, part_images) -> str:
    """Return a prompt note about attached images, or ``""`` when there are none.

    Args:
        leadin_images: Images attached to the leadin; ``None`` counts as empty.
        part_images: Images attached to the part; ``None`` counts as empty.

    Returns:
        A bracketed note stating how many images of each kind exist and that
        this text-only pipeline cannot see them, or an empty string.
    """
    li = leadin_images or []
    pi = part_images or []
    if not li and not pi:
        return ""
    return (
        f"\n\n[This bonus includes {len(li)} leadin image(s) and {len(pi)} part image(s); "
        "this text-only pipeline does not see pixels—use a VLM pipeline with "
        "`leadin_images` / `part_images`.]"
    )


def prepare_conversation(leadin, part, image_note: str = ""):
    """Build the system + user chat messages for a single bonus part.

    Args:
        leadin: Leadin text of the bonus.
        part: Text of the part being asked.
        image_note: Optional note appended after the quoted question.

    Returns:
        A list of two ``{"role", "content"}`` dicts (system, then user).
    """
    messages = [
        {
            "role": "system",
            "content": system_prompt,
        },
        {
            "role": "user",
            "content": user_prompt_template.format(
                leadin=leadin, part=part, image_note=image_note
            ),
        },
    ]
    return messages


def _coerce_text(value) -> str:
    """Return *value* as a stripped string; ``None`` becomes ``""``."""
    if value is None:
        return ""
    return str(value).strip()


def _coerce_confidence(value) -> float:
    """Convert *value* to a float clamped to ``[0, 1]``; fall back to ``0.0``.

    ``None``, containers, unparsable strings, oversized ints and NaN all map
    to ``0.0`` (with a warning) instead of raising.
    """
    try:
        confidence = float(value)
    except (TypeError, ValueError, OverflowError):
        logger.warning(f"Invalid confidence value: {value}. Defaulting to 0.0.")
        return 0.0
    # min()/max() clamping would silently turn NaN into 1.0, so reject it first.
    if math.isnan(confidence):
        logger.warning(f"Invalid confidence value: {value}. Defaulting to 0.0.")
        return 0.0
    return max(0.0, min(1.0, confidence))


def parse_output_text(output_text: str):
    """Extract ``answer``, ``explanation`` and ``confidence`` from model text.

    Everything before the first ``{`` is dropped and the remainder is parsed
    with ``json_repair``. If the parsed value is a list, its first element is
    used. Any parsing failure is logged and yields empty strings and a
    confidence of ``0.0``; this function does not raise on malformed output.

    Args:
        output_text: Decoded text generated by the model.

    Returns:
        A dict with keys ``"answer"`` (str), ``"explanation"`` (str) and
        ``"confidence"`` (float in ``[0, 1]``).
    """
    try:
        start_index = output_text.find("{")
        if start_index == -1:
            raise ValueError("No JSON object found in the output text.")
        output_text = output_text[start_index:]
        json_data = json_repair.loads(output_text)
        if isinstance(json_data, list):
            json_data = json_data[0]
        # Coerce instead of calling .strip() directly so that a numeric or
        # null field does not discard the rest of the parsed record.
        answer = _coerce_text(json_data.get("answer", ""))
        explanation = _coerce_text(json_data.get("explanation", ""))
        confidence = json_data.get("confidence", 0.0)
    except Exception as e:
        logger.warning(
            f"Error parsing JSON: {e.__class__.__name__} - {e}. Got:\n{output_text}"
        )
        answer, explanation, confidence = "", "", 0.0

    return {
        "answer": answer,
        "explanation": explanation,
        "confidence": _coerce_confidence(confidence),
    }


def postprocess_response(output_text, scores=None):
    """Parse the model output and optionally blend in a logit-based confidence.

    Args:
        output_text: Decoded text generated by the model.
        scores: Optional sequence of per-step score tensors. When given and
            non-empty, the mean of each step's maximum softmax probability is
            averaged 50/50 with the confidence parsed from the text.

    Returns:
        The dict produced by ``parse_output_text``, with ``"confidence"``
        possibly adjusted as described above.
    """
    model_response = parse_output_text(output_text)

    # Compute a confidence score by averaging the max softmax probabilities over generated tokens.
    if scores is not None and len(scores) > 0:
        probs = [F.softmax(score, dim=-1).max().item() for score in scores]
        logit_confidence = float(sum(probs) / len(probs))
        model_response["confidence"] = (
            model_response["confidence"] + logit_confidence
        ) / 2

    return model_response


class _GenOut:
    """Minimal container carrying the newly generated token ids to ``postprocess``."""

    __slots__ = ("sequences",)

    def __init__(self, sequences):
        self.sequences = sequences


class BonusPipeline(Pipeline):
    """Text-only pipeline answering one quizbowl bonus part per input row.

    The input is a dict of equally long lists (a batch) with the keys
    ``"leadin"`` and ``"part"`` and, optionally, ``"leadin_images"`` and
    ``"part_images"``. The output is a list with one
    ``{"answer", "explanation", "confidence"}`` dict per row.

    Optional call/constructor parameter:
        max_new_tokens: Generation budget per row (default 64).
    """

    def __init__(self, model, tokenizer, **kwargs):
        super().__init__(
            model=model,
            tokenizer=tokenizer,
            **kwargs,
        )
        # Left padding keeps every prompt adjacent to its generated tokens.
        self.tokenizer.padding_side = "left"
        self.tokenizer.pad_token = self.tokenizer.eos_token

    def _sanitize_parameters(self, max_new_tokens=None, **kwargs):
        """Route ``max_new_tokens`` to ``_forward``; other kwargs are ignored."""
        forward_params = {}
        if max_new_tokens is not None:
            forward_params["max_new_tokens"] = max_new_tokens
        return {}, forward_params, {}

    def preprocess(self, inputs):
        """Turn a batch dict into padded, tokenized chat prompts.

        Args:
            inputs: Dict with list-valued ``"leadin"`` and ``"part"`` and
                optional ``"leadin_images"`` / ``"part_images"``.

        Returns:
            The tokenizer output of ``apply_chat_template`` (PyTorch tensors).
        """
        batch_size = len(inputs["leadin"])
        leadin_imgs = inputs.get("leadin_images") or [[] for _ in range(batch_size)]
        part_imgs = inputs.get("part_images") or [[] for _ in range(batch_size)]
        conversations = []
        for i in range(batch_size):
            note = _bonus_image_note(leadin_imgs[i], part_imgs[i])
            conversations.append(
                prepare_conversation(
                    inputs["leadin"][i], inputs["part"][i], image_note=note
                )
            )

        model_inputs = self.tokenizer.apply_chat_template(
            conversations,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            padding=True,
            return_tensors="pt",
        )
        return model_inputs

    def _forward(self, model_inputs, max_new_tokens=DEFAULT_MAX_NEW_TOKENS):
        """Generate continuations and return only the newly generated tokens.

        Args:
            model_inputs: Tokenized prompts from ``preprocess``.
            max_new_tokens: Maximum number of tokens to generate per row.

        Returns:
            A ``_GenOut`` whose ``sequences`` excludes the prompt tokens.
        """
        # Do not use output_scores=True: it materializes full-vocab logits each step and
        # routinely OOMs mid-size GPUs (e.g. T4). postprocess() only uses decoded text.
        with torch.no_grad():
            full = self.model.generate(
                **model_inputs,
                max_new_tokens=max_new_tokens,
            )

        # Prompts are padded to a common length, so one slice strips them all.
        input_length = model_inputs["input_ids"].shape[1]
        return _GenOut(full[:, input_length:])

    def postprocess(self, model_outputs):
        """Decode generated tokens and parse each row into a result record."""
        output_texts = self.tokenizer.batch_decode(
            model_outputs.sequences, skip_special_tokens=True
        )
        return [postprocess_response(output_text) for output_text in output_texts]


PIPELINE_REGISTRY.register_pipeline(
    "quizbowl-bonus",
    pipeline_class=BonusPipeline,
    pt_model=LlamaForCausalLM,
    default={
        "pt": ("meta-llama/Llama-3.2-3B-Instruct", "main"),
    },
    type="text",
)

# %%
if __name__ == "__main__":
    import os

    from transformers import BitsAndBytesConfig

    # Full precision (default): ``device_map="auto"`` only.
    # Tight GPU (e.g. HF Space T4 with an 8B checkpoint): set ``LLAMA3_BONUS_4BIT=1``
    # (run ``pip install bitsandbytes`` first).
    model_kwargs: dict = {"device_map": "auto"}
    if os.environ.get("LLAMA3_BONUS_4BIT", "").strip().lower() in ("1", "true", "yes", "on"):
        model_kwargs["quantization_config"] = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
        )

    pipe = pipeline("quizbowl-bonus", trust_remote_code=True, model_kwargs=model_kwargs)

    examples = [
        {
            "leadin": "This is a leadin.",
            "part": "What is the capital of France?",
        },
        {
            "leadin": "This is another leadin.",
            "part": "What is the largest planet in our solar system?",
            "previous_parts": [
                {"text": "What is the smallest planet?", "guess": "Mercury"},
                {"text": "What is the second smallest planet?", "guess": "Mars"},
            ],
        },
        {
            "leadin": "This is a leadin with no previous parts.",
            "part": "What is the chemical symbol for water?",
            "previous_parts": [],
        },
    ] * 5

    dataset = Dataset.from_list(examples)

    print("Dataset size:", len(dataset))
    outputs = []
    batch_size = 5
    for batch in tqdm(dataset.batch(batch_size), desc="Processing batches"):
        output = pipe(batch, batch_size=batch_size)
        outputs.extend(output)