Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 13 additions & 9 deletions src/oagi/converters/oagi.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,21 +51,21 @@ def __call__(self, actions: list[Action]) -> list[str]:
"""
converted: list[str] = []
failed: list[tuple[str, str]] = []
has_finish = False
has_terminal = False

if not actions:
return converted

for action in actions:
# Check for duplicate finish() during iteration
is_finish = action.type == ActionType.FINISH
if is_finish:
if has_finish:
# Check for duplicate finish()/fail() during iteration
is_terminal = action.type in (ActionType.FINISH, ActionType.FAIL)
if is_terminal:
if has_terminal:
raise ValueError(
"Duplicate finish() detected. "
"Only one finish() is allowed per action sequence."
"Duplicate finish()/fail() detected. "
"Only one finish() or fail() is allowed per action sequence."
)
has_finish = True
has_terminal = True

try:
converted.extend(self._convert_action(action))
Expand Down Expand Up @@ -172,14 +172,18 @@ def _convert_single_action(self, action: Action) -> list[str]:
self._log_info("Task completion action -> DONE")
return ["DONE"]

if action_type == ActionType.FAIL.value:
self._log_info("Task infeasible action -> FAIL")
return ["FAIL"]

if action_type == ActionType.CALL_USER.value:
self._log_info("User intervention requested")
return []

raise ValueError(
f"Unknown action type: '{action_type}'. "
"Supported: click, left_double, left_triple, right_single, drag, "
"hotkey, type, scroll, wait, finish, call_user"
"hotkey, type, scroll, wait, finish, fail, call_user"
)

def serialize_actions(self, actions: list[Action]) -> list[dict[str, Any]]:
Expand Down
4 changes: 2 additions & 2 deletions src/oagi/handler/pyautogui_action_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,8 +250,8 @@ def _execute_single_action(self, action: Action) -> None:
)
pyautogui.scroll(scroll_amount)

case ActionType.FINISH:
# Task completion - reset handler state
case ActionType.FINISH | ActionType.FAIL:
# Task completion or infeasible - reset handler state
self.reset()

case ActionType.WAIT:
Expand Down
4 changes: 2 additions & 2 deletions src/oagi/handler/ydotool_action_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,8 +164,8 @@ def _execute_action(self, action: Action) -> bool:
text = self.caps_manager.transform_text(text)
self._run_ydotool(["type", text], count=count)

case ActionType.FINISH:
# Task completion - reset handler state
case ActionType.FINISH | ActionType.FAIL:
# Task completion or infeasible - reset handler state
self.reset()

case ActionType.WAIT:
Expand Down
2 changes: 1 addition & 1 deletion src/oagi/server/socketio_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -364,7 +364,7 @@ async def _emit_single_action(
timeout=self.config.socketio_timeout,
)

case ActionType.FINISH:
case ActionType.FINISH | ActionType.FAIL:
return await self.call(
"finish",
FinishEventData(**common).model_dump(),
Expand Down
1 change: 1 addition & 0 deletions src/oagi/types/models/action.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ class ActionType(str, Enum):
TYPE = "type"
SCROLL = "scroll"
FINISH = "finish"
FAIL = "fail"
WAIT = "wait"
CALL_USER = "call_user"

Expand Down
3 changes: 2 additions & 1 deletion src/oagi/utils/output_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def parse_raw_output(raw_output: str) -> Step:
parsed_action = _parse_action(action_text.strip())
if parsed_action:
actions.append(parsed_action)
if parsed_action.type == ActionType.FINISH:
if parsed_action.type in (ActionType.FINISH, ActionType.FAIL):
stop = True

return Step(reason=reason, actions=actions, stop=stop)
Expand Down Expand Up @@ -105,6 +105,7 @@ def _parse_action(action_text: str) -> Action | None:
- scroll(x, y, direction, c) # scroll at position
- wait() # wait for a while
- finish() # indicate task is finished
- fail() # indicate task is infeasible

Args:
action_text: String representation of a single action
Expand Down
1 change: 1 addition & 0 deletions src/oagi/utils/prompt_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
8. scroll(x, y, direction, c) # scroll the mouse at position (x, y) in the direction of up or down for c times, where x and y are integers normalized between 0 and 1000
9. wait() # wait for a while
10. finish() # indicate the task is finished
11. fail() # indicate the task is infeasible

Directly output the text beginning with <|think_start|>, no additional text is needed for this scenario.

Expand Down
16 changes: 16 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,12 @@ def sample_raw_output_completed():
return "<|think_start|>The task has been completed successfully<|think_end|>\n<|action_start|>finish()<|action_end|>"


@pytest.fixture
def sample_raw_output_failed():
"""Sample raw output for infeasible task."""
return "<|think_start|>The task is infeasible<|think_end|>\n<|action_start|>fail()<|action_end|>"


@pytest.fixture
def sample_step(sample_action):
"""Sample Step object for testing."""
Expand All @@ -152,6 +158,16 @@ def completed_step():
)


@pytest.fixture
def failed_step():
"""Sample failed Step object for infeasible task."""
return Step(
reason="The task is infeasible",
actions=[Action(type=ActionType.FAIL, argument="", count=1)],
stop=True,
)


@pytest.fixture
def mock_error_response():
"""Mock error HTTP response."""
Expand Down
21 changes: 21 additions & 0 deletions tests/test_actor.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,27 @@ def test_step_with_completed_response(
assert len(result.actions) == 1
assert result.actions[0].type == ActionType.FINISH

def test_step_with_failed_response(
self, actor, failed_step, sample_usage_obj, mock_upload_file_response
):
actor.task_description = "Test task"
actor.task_id = "task-789"

# Setup mocks
actor.client.put_s3_presigned_url.return_value = mock_upload_file_response
actor.client.chat_completion.return_value = (
failed_step,
"<|think_start|>infeasible<|think_end|>\n<|action_start|>fail()<|action_end|>",
sample_usage_obj,
)

result = actor.step(b"image bytes")

assert result.stop is True
assert result.reason == "The task is infeasible"
assert len(result.actions) == 1
assert result.actions[0].type == ActionType.FAIL

def test_step_handles_exception(self, actor, mock_upload_file_response):
actor.task_description = "Test task"
actor.client.put_s3_presigned_url.return_value = mock_upload_file_response
Expand Down
18 changes: 18 additions & 0 deletions tests/test_oagi_action_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,19 @@ def test_finish_action(self, converter):
result = converter([action])
assert result[0] == "DONE"

def test_fail_action(self, converter):
action = Action(type=ActionType.FAIL, argument="", count=1)
result = converter([action])
assert result[0] == "FAIL"

def test_duplicate_terminal_actions_raises(self, converter):
actions = [
Action(type=ActionType.FINISH, argument="", count=1),
Action(type=ActionType.FAIL, argument="", count=1),
]
with pytest.raises(ValueError, match="Duplicate finish\\(\\)/fail\\(\\)"):
converter(actions)


class TestActionStringToStep:
def test_pyautogui_command(self, converter):
Expand All @@ -114,6 +127,11 @@ def test_done_command(self, converter):
assert step["type"] == "sleep"
assert step["parameters"]["seconds"] == 0

def test_fail_command(self, converter):
step = converter.action_string_to_step("FAIL")
assert step["type"] == "sleep"
assert step["parameters"]["seconds"] == 0


class TestMultipleActions:
def test_action_count(self, converter):
Expand Down
19 changes: 19 additions & 0 deletions tests/test_pyautogui_action_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,11 @@ def test_finish_action(handler, mock_pyautogui):
handler([action])


def test_fail_action(handler, mock_pyautogui):
action = Action(type=ActionType.FAIL, argument="", count=1)
handler([action])


def test_call_user_action(handler, mock_pyautogui, capsys):
action = Action(type=ActionType.CALL_USER, argument="", count=1)
handler([action])
Expand Down Expand Up @@ -453,6 +458,20 @@ def test_finish_action_resets_handler(self, mock_pyautogui):
handler([finish_action])
assert handler.caps_manager.caps_enabled is False

def test_fail_action_resets_handler(self, mock_pyautogui):
config = PyautoguiConfig(capslock_mode="session", post_batch_delay=0)
handler = PyautoguiActionHandler(config=config)

# Enable caps lock
caps_action = Action(type=ActionType.HOTKEY, argument="capslock", count=1)
handler([caps_action])
assert handler.caps_manager.caps_enabled is True

# FAIL action should also reset handler
fail_action = Action(type=ActionType.FAIL, argument="", count=1)
handler([fail_action])
assert handler.caps_manager.caps_enabled is False


class TestAsyncHandlerReset:
def test_async_handler_reset_delegates_to_sync_handler(self, mock_pyautogui):
Expand Down
25 changes: 25 additions & 0 deletions tests/utils/test_output_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,15 @@ def test_parse_finish_action_sets_stop(self):
assert step.actions[0].type == ActionType.FINISH
assert step.stop is True

def test_parse_fail_action_sets_stop(self):
raw = "<|think_start|>Task is infeasible<|think_end|>\n<|action_start|>fail()<|action_end|>"
step = parse_raw_output(raw)

assert step.reason == "Task is infeasible"
assert len(step.actions) == 1
assert step.actions[0].type == ActionType.FAIL
assert step.stop is True

def test_parse_multiple_actions_with_ampersand(self):
raw = "<|think_start|>Do two things<|think_end|>\n<|action_start|>click(100, 200) & type(hello)<|action_end|>"
step = parse_raw_output(raw)
Expand Down Expand Up @@ -147,6 +156,7 @@ class TestParseAction:
("type(hello world)", ActionType.TYPE, "hello world"),
("wait()", ActionType.WAIT, ""),
("finish()", ActionType.FINISH, ""),
("fail()", ActionType.FAIL, ""),
],
)
def test_parse_basic_actions(self, action_text, expected_type, expected_arg):
Expand Down Expand Up @@ -254,3 +264,18 @@ def test_finish_with_other_actions(self):

assert step.stop is True
assert len(step.actions) == 2

def test_fail_with_other_actions(self):
raw = "<|think_start|>Test<|think_end|>\n<|action_start|>click(100, 200) & fail()<|action_end|>"
step = parse_raw_output(raw)

assert step.stop is True
assert len(step.actions) == 2
assert step.actions[1].type == ActionType.FAIL

def test_multiple_fail_actions_stop_true(self):
raw = "<|think_start|>Test<|think_end|>\n<|action_start|>fail() & fail()<|action_end|>"
step = parse_raw_output(raw)

assert step.stop is True
assert len(step.actions) == 2