187 lines
7.4 KiB
Python
187 lines
7.4 KiB
Python
|
|
from typing import Literal
|
||
|
|
|
||
|
|
from pydantic import BaseModel, Field
|
||
|
|
|
||
|
|
SYSTEM_PROMPT: str = """Imagine you are a robot browsing the web, just like humans. Now you need to complete a task.
|
||
|
|
In each iteration, you will receive an Observation that includes the last screenshots of a web browser and the current memory of the agent.
|
||
|
|
You have also information about the step that the agent is trying to achieve to solve the task.
|
||
|
|
Carefully analyze the visual information to identify what to do, then follow the guidelines to choose the following action.
|
||
|
|
You should detail your thought (i.e. reasoning steps) before taking the action.
|
||
|
|
Also detail in the notes field of the action the extracted information relevant to solve the task.
|
||
|
|
Once you have enough information in the notes to answer the task, return an answer action with the detailed answer in the notes field.
|
||
|
|
This will be evaluated by an evaluator and should match all the criteria or requirements of the task.
|
||
|
|
|
||
|
|
Guidelines:
|
||
|
|
- store in the notes all the relevant information to solve the task that fulfill the task criteria. Be precise
|
||
|
|
- Use both the task and the step information to decide what to do
|
||
|
|
- if you want to write in a text field and the text field already has text, designate the text field by the text it contains and its type
|
||
|
|
- If there is a cookies notice, always accept all the cookies first
|
||
|
|
- The observation is the screenshot of the current page and the memory of the agent.
|
||
|
|
- If you see relevant information on the screenshot to answer the task, add it to the notes field of the action.
|
||
|
|
- If there is no relevant information on the screenshot to answer the task, add an empty string to the notes field of the action.
|
||
|
|
- If you see buttons that allow to navigate directly to relevant information, like jump to ... or go to ... , use them to navigate faster.
|
||
|
|
- In the answer action, give as many details a possible relevant to answering the task.
|
||
|
|
- if you want to write, don't click before. Directly use the write action
|
||
|
|
- to write, identify the web element which is type and the text it already contains
|
||
|
|
- If you want to use a search bar, directly write text in the search bar
|
||
|
|
- Don't scroll too much. Don't scroll if the number of scrolls is greater than 3
|
||
|
|
- Don't scroll if you are at the end of the webpage
|
||
|
|
- Only refresh if you identify a rate limit problem
|
||
|
|
- If you are looking for a single flights, click on round-trip to select 'one way'
|
||
|
|
- Never try to login, enter email or password. If there is a need to login, then go back.
|
||
|
|
- If you are facing a captcha on a website, try to solve it.
|
||
|
|
|
||
|
|
- if you have enough information in the screenshot and in the notes to answer the task, return an answer action with the detailed answer in the notes field
|
||
|
|
- The current date is {timestamp}.
|
||
|
|
|
||
|
|
# <output_json_format>
|
||
|
|
# ```json
|
||
|
|
# {output_format}
|
||
|
|
# ```
|
||
|
|
# </output_json_format>
|
||
|
|
|
||
|
|
"""
|
||
|
|
|
||
|
|
|
||
|
|
class ClickElementAction(BaseModel):
    """Click at absolute coordinates of a web element with its description"""

    # NOTE: the class docstring and every Field description are emitted by
    # pydantic's model_json_schema(); keep their wording stable.

    # Literal tag naming this action type.
    action: Literal["click_element"] = Field(
        description="Click at absolute coordinates of a web element",
    )
    # Free-text description of the element being clicked.
    element: str = Field(
        description="text description of the element",
    )
    # Pixel position of the click target.
    x: int = Field(
        description="The x coordinate, number of pixels from the left edge.",
    )
    y: int = Field(
        description="The y coordinate, number of pixels from the top edge.",
    )

    def log(self) -> str:
        """Return a first-person sentence describing the click."""
        message = f"I have clicked on the element '{self.element}' at absolute coordinates {self.x}, {self.y}"
        return message
|
||
|
|
|
||
|
|
|
||
|
|
class WriteElementAction(BaseModel):
    """Write content at absolute coordinates of a web element identified by its description, then press Enter."""

    # NOTE: the class docstring and every Field description are emitted by
    # pydantic's model_json_schema(); keep their wording stable.

    # Literal tag naming this action type.
    action: Literal["write_element_abs"] = Field(
        description="Write content at absolute coordinates of a web page",
    )
    # Text to type into the target element.
    content: str = Field(
        description="Content to write",
    )
    # Free-text description of the target element.
    element: str = Field(
        description="Text description of the element",
    )
    # Pixel position of the target element.
    x: int = Field(
        description="The x coordinate, number of pixels from the left edge.",
    )
    y: int = Field(
        description="The y coordinate, number of pixels from the top edge.",
    )

    def log(self) -> str:
        """Return a first-person sentence describing what was written and where."""
        message = f"I have written '{self.content}' in the element '{self.element}' at absolute coordinates {self.x}, {self.y}"
        return message
|
||
|
|
|
||
|
|
|
||
|
|
class ScrollAction(BaseModel):
    """Scroll action with no required element"""

    # NOTE: the class docstring and every Field description are emitted by
    # pydantic's model_json_schema(); keep their wording stable.

    # Literal tag naming this action type.
    action: Literal["scroll"] = Field(
        description="Scroll the page or a specific element",
    )
    # One of the four allowed scroll directions.
    direction: Literal["down", "up", "left", "right"] = Field(
        description="The direction to scroll in",
    )

    def log(self) -> str:
        """Return a first-person sentence naming the scroll direction."""
        message = f"I have scrolled {self.direction}"
        return message
|
||
|
|
|
||
|
|
|
||
|
|
class GoBackAction(BaseModel):
    """Action to navigate back in browser history"""

    # NOTE: the class docstring and the Field description are emitted by
    # pydantic's model_json_schema(); keep their wording stable.

    # Literal tag naming this action type; the action carries no other data.
    action: Literal["go_back"] = Field(
        description="Navigate to the previous page",
    )

    def log(self) -> str:
        """Return the fixed first-person sentence for a back navigation."""
        message = "I have gone back to the previous page"
        return message
|
||
|
|
|
||
|
|
|
||
|
|
class RefreshAction(BaseModel):
    """Action to refresh the current page"""

    # NOTE: the class docstring and the Field description are emitted by
    # pydantic's model_json_schema(); keep their wording stable.

    # Literal tag naming this action type; the action carries no other data.
    action: Literal["refresh"] = Field(
        description="Refresh the current page",
    )

    def log(self) -> str:
        """Return the fixed first-person sentence for a page refresh."""
        message = "I have refreshed the page"
        return message
|
||
|
|
|
||
|
|
|
||
|
|
class GotoAction(BaseModel):
    """Action to go to a particular URL"""

    # NOTE: the class docstring and every Field description are emitted by
    # pydantic's model_json_schema(); keep their wording stable.

    # Literal tag naming this action type.
    action: Literal["goto"] = Field(
        description="Goto a particular URL",
    )
    # Destination address. The scheme requirement is stated only in the
    # description text; no validator in this class enforces it.
    url: str = Field(
        description="A url starting with http:// or https://",
    )

    def log(self) -> str:
        """Return a first-person sentence naming the URL that was opened."""
        message = f"I have navigated to the URL {self.url}"
        return message
|
||
|
|
|
||
|
|
|
||
|
|
class WaitAction(BaseModel):
    """Action to wait for a particular amount of time"""

    # NOTE: the class docstring and every Field description are emitted by
    # pydantic's model_json_schema(); keep their wording stable.

    # Literal tag naming this action type.
    action: Literal["wait"] = Field(
        description="Wait for a particular amount of time",
    )
    # Wait duration: defaults to 2 and is constrained to the range 0..10.
    seconds: int = Field(
        default=2,
        ge=0,
        le=10,
        description="The number of seconds to wait",
    )

    def log(self) -> str:
        """Return a first-person sentence stating how long the wait was."""
        message = f"I have waited for {self.seconds} seconds"
        return message
|
||
|
|
|
||
|
|
|
||
|
|
class RestartAction(BaseModel):
    """Restart the task from the beginning."""

    # NOTE: the class docstring is emitted by pydantic's model_json_schema();
    # keep its wording stable.

    # Literal tag naming this action type. Here the tag has a default value,
    # so the model can be built without arguments.
    action: Literal["restart"] = "restart"

    def log(self) -> str:
        """Return the fixed first-person sentence for a task restart."""
        message = "I have restarted the task from the beginning"
        return message
|
||
|
|
|
||
|
|
|
||
|
|
class AnswerAction(BaseModel):
    """Return a final answer to the task. This is the last action to call in an episode."""

    # NOTE: the class docstring and the Field description are emitted by
    # pydantic's model_json_schema(); keep their wording stable.

    # Literal tag naming this action type; it has a default value.
    action: Literal["answer"] = "answer"
    # The answer text itself (required).
    content: str = Field(
        description="The answer content",
    )

    def log(self) -> str:
        """Return a first-person sentence quoting the submitted answer."""
        message = f"I have answered the task with '{self.content}'"
        return message
|
||
|
|
|
||
|
|
|
||
|
|
# Union of every action model the agent may emit; used as the type of
# NavigationStep.action. Member order is kept as-is because it determines the
# order of the alternatives in the generated JSON schema.
ActionSpace = (
    ClickElementAction |
    WriteElementAction |
    ScrollAction |
    GoBackAction |
    RefreshAction |
    WaitAction |
    RestartAction |
    AnswerAction |
    GotoAction
)
|
||
|
|
|
||
|
|
|
||
|
|
class NavigationStep(BaseModel):
    # One agent turn: an optional note, a short reasoning string, and the
    # chosen action. Its JSON schema is what get_navigation_prompt() injects
    # into the system prompt, so no class docstring is added here (pydantic
    # would emit it as a schema "description") and the Field descriptions are
    # kept verbatim.

    # Optional; defaults to the empty string.
    note: str = Field(
        description="Task-relevant information extracted from the previous observation. Keep empty if no new info.",
        default="",
    )
    # Required reasoning text.
    thought: str = Field(
        description="Reasoning about next steps (<4 lines)",
    )
    # Required; must validate as one of the ActionSpace members.
    action: ActionSpace = Field(
        description="Next action to take",
    )
|
||
|
|
|
||
|
|
|
||
|
|
def get_navigation_prompt(task, image, step=1, *, timestamp="2025-06-04 14:16:03"):
    """Build the chat messages for a single navigation step.

    Args:
        task: Task description; interpolated into a ``<task>`` block.
        image: Screenshot payload, stored unchanged under the ``"image"`` key
            of the image content part.
        step: Step index written into the ``<observation step=...>`` tag.
        timestamp: Keyword-only. Text substituted for ``{timestamp}`` in the
            system prompt ("The current date is ..."). The default is the
            value that used to be hard-coded here, so existing callers get
            the same date; pass the real time for live runs.

    Returns:
        A list of two message dicts: the system message carrying the
        formatted ``SYSTEM_PROMPT`` and the user message carrying the task,
        the observation tags and the screenshot.
    """
    # Local import keeps this block self-contained.
    import json

    # model_json_schema() returns a dict; formatting it directly would put its
    # Python repr (single quotes, True/None) into a section the prompt labels
    # as JSON. Serialize it so the section really is JSON.
    schema_text = json.dumps(NavigationStep.model_json_schema())

    system_prompt = SYSTEM_PROMPT.format(
        output_format=schema_text,
        timestamp=timestamp,
    )

    system_message = {
        "role": "system",
        "content": [
            {"type": "text", "text": system_prompt},
        ],
    }
    user_message = {
        "role": "user",
        "content": [
            {"type": "text", "text": f"<task>\n{task}\n</task>\n"},
            {"type": "text", "text": f"<observation step={step}>\n"},
            {"type": "text", "text": "<screenshot>\n"},
            {
                "type": "image",
                "image": image,
            },
            {"type": "text", "text": "\n</screenshot>\n"},
            {"type": "text", "text": "\n</observation>\n"},
        ],
    }
    return [system_message, user_message]
|