53 lines
1.6 KiB
Python
53 lines
1.6 KiB
Python
|
|
import json
|
||
|
|
from typing import Any, Literal
|
||
|
|
|
||
|
|
from pydantic import BaseModel
|
||
|
|
|
||
|
|
|
||
|
|
def get_localization_prompt(image: Any, instruction: str) -> list[dict[str, Any]]:
    """Build the chat messages asking for a click position as ``Click(x, y)``.

    Args:
        image: The GUI screenshot. It is stored unchanged under the
            ``"image"`` key of the first content entry; its type is not
            constrained by this function.
        instruction: Description of the element to localize. It is appended
            to the fixed guidelines, separated by a single newline.

    Returns:
        A one-element list holding a single ``"user"`` message whose content
        is the image entry followed by the text entry.
    """
    # The wording below is prompt text sent verbatim; do not reflow or edit it
    # casually. Adjacent literals concatenate to the original single string.
    guidelines = (
        "Localize an element on the GUI image according to my instructions and "
        "output a click position as Click(x, y) with x num pixels from the left "
        "edge and y num pixels from the top edge."
    )

    image_part = {"type": "image", "image": image}
    text_part = {"type": "text", "text": f"{guidelines}\n{instruction}"}

    return [
        {
            "role": "user",
            "content": [image_part, text_part],
        }
    ]
|
||
|
|
|
||
|
|
|
||
|
|
class ClickAction(BaseModel):
    """Click at specific coordinates on the screen."""

    # NOTE(review): the class docstring above is presumably emitted by pydantic
    # as the schema "description", and get_localization_prompt_structured_output
    # sends this model's JSON schema as the system message - treat that text as
    # prompt content, not as free-form documentation. Confirm before rewording.

    # Fixed discriminator; the only accepted value is "click".
    action: Literal["click"] = "click"
    x: int
    """The x coordinate, number of pixels from the left edge."""
    y: int
    """The y coordinate, number of pixels from the top edge."""
|
||
|
|
|
||
|
|
|
||
|
|
def get_localization_prompt_structured_output(image: Any, instruction: str) -> list[dict[str, Any]]:
    """Build the chat messages asking for a click position as JSON.

    The first message is a ``"system"`` message whose content is the JSON
    serialization of a one-element list containing ``ClickAction``'s JSON
    schema. The second is a ``"user"`` message carrying the image and the
    textual request.

    Args:
        image: The GUI screenshot. It is stored unchanged under the
            ``"image"`` key of the first user content entry; its type is not
            constrained by this function.
        instruction: Description of the element to localize. It is appended
            to the fixed guidelines, separated by a single newline.

    Returns:
        A two-element list: the system message followed by the user message.
    """
    # The wording below is prompt text sent verbatim; do not reflow or edit it
    # casually. Adjacent literals concatenate to the original single string.
    guidelines = (
        "Localize an element on the GUI image according to my instructions and "
        "output a click position. You must output a valid JSON format."
    )

    # The schema is wrapped in a list before serialization, matching the
    # original payload shape.
    schema_payload = json.dumps([ClickAction.model_json_schema()])

    system_message = {
        "role": "system",
        "content": schema_payload,
    }
    user_message = {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": f"{guidelines}\n{instruction}"},
        ],
    }
    return [system_message, user_message]
|