# Holo1-7B/screenspot_eval.py

import argparse
import json
import math
import re
from io import BytesIO

import numpy as np
from datasets import load_dataset
from PIL.Image import Image
from PIL.Image import open as open_img
from tqdm import tqdm
from transformers import AutoModelForImageTextToText, AutoProcessor
from transformers.modeling_utils import PreTrainedModel
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
from transformers.processing_utils import ProcessorMixin

# Prompt prefix for the plain-text mode: asks the model to answer with a `Click(x, y)` string in pixel coordinates.
INSTRUCTION_LOCALIZATION: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
# Prompt prefix for the tool-call mode: asks the model to answer with a click position in valid JSON.
INSTRUCTION_LOCALIZATION_TOOLCALL: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."


def load_screenspot(dataset_id: str, subset: str = "test"):
    """Load the dataset `dataset_id` with `load_dataset` and return its `subset` split."""
    return load_dataset(dataset_id)[subset]


def l1(dx: float, dy: float) -> float:
    """Compute the L1 (sum of absolute components) length of the vector (dx, dy)."""
    horizontal, vertical = abs(dx), abs(dy)
    return horizontal + vertical


def l2(dx: float, dy: float) -> float:
    """Compute the L2 (Euclidean) length of the vector (dx, dy)."""
    squared_length = dx**2 + dy**2
    return squared_length**0.5


def point_to_rectangle_dist(x: float, y: float, rectangle: tuple, distance_type="L2"):
    """Measure how far the point (x, y) lies from an axis-aligned rectangle.

    Args:
        x: Horizontal coordinate of the point.
        y: Vertical coordinate of the point.
        rectangle: `(x1, y1, x2, y2)` with (x1, y1) the top-left and (x2, y2) the bottom-right corner.
        distance_type: `"L1"` or `"L2"`, selecting the norm used for the distance.

    Returns:
        0 when the point lies inside the rectangle or on its border, otherwise the distance
        between the point and the nearest point of the rectangle.

    Raises:
        ValueError: If the point is outside the rectangle and `distance_type` is neither
            `"L1"` nor `"L2"`.
    """
    left, top, right, bottom = rectangle

    # Points inside the rectangle (border included) have zero distance.
    if left <= x <= right and top <= y <= bottom:
        return 0

    # Clamp the point onto the rectangle to obtain its nearest neighbour there.
    nearest_x = max(left, min(x, right))
    nearest_y = max(top, min(y, bottom))
    offset_x = x - nearest_x
    offset_y = y - nearest_y

    if distance_type == "L2":
        return l2(offset_x, offset_y)
    if distance_type == "L1":
        return l1(offset_x, offset_y)
    raise ValueError("Invalid distance type. Use 'L1' or 'L2'.")


def is_in_bbox(bbox: tuple, x: float, y: float) -> bool:
    """Tell whether the point (x, y) lies within `bbox`, border included.

    `bbox` is `(x_top_left, y_top_left, x_bottom_right, y_bottom_right)`.
    """
    left, top, right, bottom = bbox
    within_columns = left <= x <= right
    return within_columns and top <= y <= bottom


def assemble_message(image, instruction, use_tool_call: bool = True):
    """Build the chat messages for one localization request.

    Args:
        image: Image object placed in the user message's image entry.
        instruction: Text appended, after a newline, to the localization prompt.
        use_tool_call: If true, use the tool-call prompt and prepend a system message holding
            the click-action tool schema; otherwise use the plain `Click(x, y)` prompt and
            return the user message only.

    Returns:
        A list of message dicts: `[system, user]` in tool-call mode, `[user]` otherwise.
    """
    task_prompt = INSTRUCTION_LOCALIZATION_TOOLCALL if use_tool_call else INSTRUCTION_LOCALIZATION
    image_part = {"type": "image", "image": image}
    text_part = {"type": "text", "text": f"{task_prompt}\n{instruction}"}
    conversation = [{"role": "user", "content": [image_part, text_part]}]

    if not use_tool_call:
        return conversation

    # JSON description of the click tool, passed verbatim as the system prompt.
    tool_schema = '[{"name": "click_action", "description": "Click at specific coordinates on the screen.", "parameters": {"additionalProperties": false, "description": "Click at specific coordinates on the screen.", "properties": {"action": {"const": "click", "default": "click", "title": "Action", "type": "string"}, "x": {"description": "The x coordinate, number of pixels from the left edge.", "title": "X", "type": "integer"}, "y": {"description": "The y coordinate, number of pixels from the top edge.", "title": "Y", "type": "integer"}}, "required": ["action", "x", "y"], "title": "ClickAction", "type": "object"}, "strict": true}]'
    conversation.insert(0, {"role": "system", "content": tool_schema})
    return conversation


def do_smart_resize(image: Image, image_processor: ProcessorMixin) -> tuple[Image, int, int]:
    """Resize `image` to the dimensions chosen by `smart_resize` for the given image processor.

    The resize factor is `patch_size * merge_size`, and the pixel budget comes from the
    processor's `min_pixels` / `max_pixels`.

    Returns:
        The resized image, followed by the target height and the target width.
    """
    size_multiple = image_processor.patch_size * image_processor.merge_size
    target_height, target_width = smart_resize(
        image.height,
        image.width,
        factor=size_multiple,
        min_pixels=image_processor.min_pixels,
        max_pixels=image_processor.max_pixels,
    )
    # PIL expects (width, height), while smart_resize reports (height, width).
    resized_image = image.resize(size=(target_width, target_height), resample=None)
    return resized_image, target_height, target_width


def inference(
    model: PreTrainedModel, processor: ProcessorMixin, dataset, smart_resize: bool = True, use_toolcall: bool = True
):
    """Gather raw inference results from the model.

    Args:
        model: Model used for generation; the inputs are moved to `model.device`.
        processor: Processor providing the chat template, the image processor and decoding.
        dataset: Iterable of samples with the keys `"bbox"`, `"instruction"` and `"image"`.
        smart_resize: If true, resize every image with `do_smart_resize` before inference.
        use_toolcall: If true, prompt for a JSON tool call and convert its arguments into a
            `Click(x, y)` string; otherwise prompt for `Click(x, y)` directly.

    Returns:
        One dict per sample with the keys `sample_id`, `ground_truth`, `prediction_raw`,
        `image_shape_raw` and `img_shape_processed` (both shapes are `(height, width)`).
    """
    results = []
    for i, sample in enumerate(tqdm(dataset, "running inference requests")):
        bbox = sample["bbox"]
        instruction = sample["instruction"]
        image = sample["image"]  # this seems to be a PNG, maybe JPEG artifacts cause the difference?
        image_shape_raw = (image.height, image.width)
        # Forward the flag so that the prompt variant matches the output parsing mode below.
        message = assemble_message(image=image, instruction=instruction, use_tool_call=use_toolcall)

        # Preparation for inference
        if smart_resize:
            image, resized_height, resized_width = do_smart_resize(
                image=image, image_processor=processor.image_processor
            )
        else:
            resized_height, resized_width = image_shape_raw
        text = processor.apply_chat_template(message, tokenize=False, add_generation_prompt=True)

        # compress to JPEG, which is needed for highest possible performance
        buffer = BytesIO()
        image.convert("RGB").save(buffer, format="JPEG", quality=90)
        image = open_img(buffer)

        inputs = processor(
            text=[text],
            images=image,
            padding=True,
            return_tensors="pt",
        )
        # Follow the model's device instead of assuming the default CUDA device.
        inputs = inputs.to(model.device)

        # Inference: Generation of the output
        generated_ids = model.generate(**inputs, max_new_tokens=128, do_sample=False)
        # Strip the prompt tokens so that only the newly generated tokens are decoded.
        generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
        output_text = processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )

        prediction_raw = output_text[0]
        if use_toolcall:
            try:
                content = json.loads(output_text[0])
                prediction_raw = f"Click({content['arguments']['x']}, {content['arguments']['y']})"
            except Exception as e:
                # Best effort: keep the raw text so that the evaluation can still try to parse it.
                print(f"Error parsing tool call, using message content instead if available: {repr(e)}")
                prediction_raw = output_text[0]

        results.append(
            {
                "sample_id": i,
                "ground_truth": tuple(bbox),
                "prediction_raw": prediction_raw,
                "image_shape_raw": image_shape_raw,
                "img_shape_processed": (resized_height, resized_width),
            }
        )
    return results


def get_sample_result(result: dict):
    """Postprocess an inference result and compute metrics for this sample.

    The prediction must start with `Click(x, y)` using non-negative integer pixel coordinates.
    The coordinates are divided by the processed image width/height before being compared
    with the ground-truth bounding box.

    Args:
        result: Dict with the keys `sample_id`, `ground_truth`, `prediction_raw`,
            `image_shape_raw` and `img_shape_processed` (both shapes are `(height, width)`).

    Returns:
        Dict with the keys `sample_id`, `has_correct_format`, `has_resized_image`,
        `click_in_box`, `click_l1_dist_to_bbox` and `click_l2_dist_to_bbox`. When the
        prediction cannot be parsed, `has_correct_format` and `click_in_box` are False and
        the distances are set to the longest possible distances in the unit square.
    """
    raw_height, raw_width = result["image_shape_raw"]
    height, width = result["img_shape_processed"]
    has_resized_image = height != raw_height or width != raw_width
    try:
        bbox = result["ground_truth"]
        match = re.match(r"Click\((\d+),\s*(\d+)\)", result["prediction_raw"])
        if match is None:
            raise ValueError("prediction does not follow the Click(x, y) format")
        predicted_x = float(match.group(1)) / width
        predicted_y = float(match.group(2)) / height
    except (KeyError, TypeError, ValueError, ZeroDivisionError):
        # Return right away: falling through would use the undefined predicted coordinates.
        return {
            "sample_id": result["sample_id"],
            "has_correct_format": False,
            "has_resized_image": has_resized_image,
            "click_in_box": False,
            "click_l1_dist_to_bbox": 2,  # Longest possible L1 distance in the unit square
            "click_l2_dist_to_bbox": math.sqrt(2),  # Longest possible L2 distance in the unit square
        }

    return {
        "sample_id": result["sample_id"],
        "has_correct_format": True,
        "has_resized_image": has_resized_image,
        "click_in_box": bool(is_in_bbox(bbox, x=predicted_x, y=predicted_y)),
        # Both distances are 0 when the click lies inside the bounding box.
        "click_l1_dist_to_bbox": point_to_rectangle_dist(predicted_x, predicted_y, bbox, "L1"),
        "click_l2_dist_to_bbox": point_to_rectangle_dist(predicted_x, predicted_y, bbox, "L2"),
    }


def aggregate_metrics(sample_metrics):
    """Reduce the per-sample metric dicts to dataset-level averages.

    Returns a dict holding the click accuracy, the share of samples whose L2 distance to the
    bounding box is below each threshold, the mean L1/L2 distances, the format accuracy and
    the share of resized images.
    """

    def column_mean(key):
        return np.mean([entry[key] for entry in sample_metrics])

    summary = {"click_accuracy": column_mean("click_in_box")}

    # Relaxed accuracies: a click counts as a hit when it is closer than `threshold` to the box.
    summary.update(
        {
            f"click_accuracy_p{threshold}": np.mean(
                [entry["click_l2_dist_to_bbox"] < threshold for entry in sample_metrics]
            )
            for threshold in (0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5)
        }
    )

    summary["avg_click_l1_dist_to_bbox"] = column_mean("click_l1_dist_to_bbox")
    summary["avg_click_l2_dist_to_bbox"] = column_mean("click_l2_dist_to_bbox")
    summary["format_accuracy"] = column_mean("has_correct_format")
    summary["has_resized_image"] = column_mean("has_resized_image")
    return summary


def evaluate_results(results: list[dict]):
    """Score every raw inference result and return the dataset-level aggregated metrics."""
    scored_samples = [get_sample_result(entry) for entry in results]
    return aggregate_metrics(scored_samples)


def main(
    model_id: str = "Hcompany/Holo1-3B",
    dataset_id: str = "rootsautomation/ScreenSpot",
    outfile: str = "results.json",
    use_toolcall: bool = True,
):
    """Run the evaluation end to end.

    Loads the model and its processor from `model_id`, loads the dataset `dataset_id`, runs
    inference on the GPU, writes the aggregated metrics as JSON to `outfile` and prints them.
    """
    vlm = AutoModelForImageTextToText.from_pretrained(model_id)
    vlm_processor = AutoProcessor.from_pretrained(model_id)
    samples = load_screenspot(dataset_id)

    raw_results = inference(vlm.cuda(), vlm_processor, samples, use_toolcall=use_toolcall)
    metrics = evaluate_results(raw_results)

    with open(outfile, "w") as out_stream:
        json.dump(metrics, out_stream)

    for name in metrics:
        print(f"{name}:\t{metrics[name]}")


def parse_bool_flag(value) -> bool:
    """Convert a command-line value such as "true" or "false" into a bool.

    `type=bool` must not be used with argparse: `bool("False")` is True because every
    non-empty string is truthy, so the option could never be switched off with a word.

    Args:
        value: The raw command-line string (a bool is passed through unchanged).

    Returns:
        True for "true", "t", "yes", "y", "1" and False for "false", "f", "no", "n", "0"
        or the empty string (case-insensitive, surrounding whitespace ignored).

    Raises:
        argparse.ArgumentTypeError: If the value is not one of the accepted spellings.
    """
    if isinstance(value, bool):
        return value
    normalized = str(value).strip().lower()
    if normalized in {"true", "t", "yes", "y", "1"}:
        return True
    # The empty string stays False, as it was with the former `type=bool`.
    if normalized in {"false", "f", "no", "n", "0", ""}:
        return False
    raise argparse.ArgumentTypeError(f"Expected a boolean value (true/false), got {value!r}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run the main function with model and dataset IDs.")

    parser.add_argument(
        "--model_id",
        type=str,
        default="Hcompany/Holo1-3B",
        help="The identifier for the model to use (default: Hcompany/Holo1-3B)",
    )

    parser.add_argument(
        "--dataset_id",
        type=str,
        default="rootsautomation/ScreenSpot",
        help="The identifier for the dataset to use (default: rootsautomation/ScreenSpot)",
    )

    # NOTE(review): this default ("result.json") differs from the default of main() ("results.json").
    parser.add_argument(
        "--outfile",
        type=str,
        default="result.json",
        help="Output json-file containing the aggregated metrics.",
    )

    parser.add_argument(
        "--use_toolcall",
        type=parse_bool_flag,  # explicit parsing, so that "--use_toolcall False" really disables it
        default=True,
        help="Enable or disable tool call prompting",
    )
    args = parser.parse_args()
    main(model_id=args.model_id, dataset_id=args.dataset_id, outfile=args.outfile, use_toolcall=args.use_toolcall)