From 611bea44a3137522a9ff22f799b61a7aa802c10c Mon Sep 17 00:00:00 2001 From: aoshen524 Date: Wed, 4 Feb 2026 23:06:32 +0000 Subject: [PATCH] feat(action): add fail action type for infeasible tasks Add FAIL action type to allow models to signal a task is infeasible: - Add FAIL="fail" to ActionType enum - Set stop=True for fail() in output parser (same as finish()) - Handle FAIL in pyautogui/ydotool handlers (reset state like FINISH) - Handle FAIL in socketio server (emit finish event) - Add fail() to prompt template instruction - Add test coverage for all changes Co-Authored-By: Claude Opus 4.5 --- src/oagi/converters/oagi.py | 22 ++++++++++------- src/oagi/handler/pyautogui_action_handler.py | 4 ++-- src/oagi/handler/ydotool_action_handler.py | 4 ++-- src/oagi/server/socketio_server.py | 2 +- src/oagi/types/models/action.py | 1 + src/oagi/utils/output_parser.py | 3 ++- src/oagi/utils/prompt_builder.py | 1 + tests/conftest.py | 16 +++++++++++++ tests/test_actor.py | 21 ++++++++++++++++ tests/test_oagi_action_converter.py | 18 ++++++++++++++ tests/test_pyautogui_action_handler.py | 19 +++++++++++++++ tests/utils/test_output_parser.py | 25 ++++++++++++++++++++ 12 files changed, 121 insertions(+), 15 deletions(-) diff --git a/src/oagi/converters/oagi.py b/src/oagi/converters/oagi.py index de428a0..9dfefa6 100644 --- a/src/oagi/converters/oagi.py +++ b/src/oagi/converters/oagi.py @@ -51,21 +51,21 @@ def __call__(self, actions: list[Action]) -> list[str]: """ converted: list[str] = [] failed: list[tuple[str, str]] = [] - has_finish = False + has_terminal = False if not actions: return converted for action in actions: - # Check for duplicate finish() during iteration - is_finish = action.type == ActionType.FINISH - if is_finish: - if has_finish: + # Check for duplicate finish()/fail() during iteration + is_terminal = action.type in (ActionType.FINISH, ActionType.FAIL) + if is_terminal: + if has_terminal: raise ValueError( - "Duplicate finish() detected. " - "Only one finish() is allowed per action sequence." + "Duplicate finish()/fail() detected. " + "Only one finish() or fail() is allowed per action sequence." ) - has_finish = True + has_terminal = True try: converted.extend(self._convert_action(action)) @@ -172,6 +172,10 @@ def _convert_single_action(self, action: Action) -> list[str]: self._log_info("Task completion action -> DONE") return ["DONE"] + if action_type == ActionType.FAIL.value: + self._log_info("Task infeasible action -> FAIL") + return ["FAIL"] + if action_type == ActionType.CALL_USER.value: self._log_info("User intervention requested") return [] @@ -179,7 +183,7 @@ def _convert_single_action(self, action: Action) -> list[str]: raise ValueError( f"Unknown action type: '{action_type}'. " "Supported: click, left_double, left_triple, right_single, drag, " - "hotkey, type, scroll, wait, finish, call_user" + "hotkey, type, scroll, wait, finish, fail, call_user" ) def serialize_actions(self, actions: list[Action]) -> list[dict[str, Any]]: diff --git a/src/oagi/handler/pyautogui_action_handler.py b/src/oagi/handler/pyautogui_action_handler.py index c0e1baa..e767637 100644 --- a/src/oagi/handler/pyautogui_action_handler.py +++ b/src/oagi/handler/pyautogui_action_handler.py @@ -250,8 +250,8 @@ def _execute_single_action(self, action: Action) -> None: ) pyautogui.scroll(scroll_amount) - case ActionType.FINISH: - # Task completion - reset handler state + case ActionType.FINISH | ActionType.FAIL: + # Task completion or infeasible - reset handler state self.reset() case ActionType.WAIT: diff --git a/src/oagi/handler/ydotool_action_handler.py b/src/oagi/handler/ydotool_action_handler.py index 354b842..49b8ab0 100644 --- a/src/oagi/handler/ydotool_action_handler.py +++ b/src/oagi/handler/ydotool_action_handler.py @@ -164,8 +164,8 @@ def _execute_action(self, action: Action) -> bool: text = self.caps_manager.transform_text(text) self._run_ydotool(["type", text], count=count) - case ActionType.FINISH: - # Task completion - reset handler state + case ActionType.FINISH | ActionType.FAIL: + # Task completion or infeasible - reset handler state self.reset() case ActionType.WAIT: diff --git a/src/oagi/server/socketio_server.py b/src/oagi/server/socketio_server.py index a6a0b05..89e59bb 100644 --- a/src/oagi/server/socketio_server.py +++ b/src/oagi/server/socketio_server.py @@ -364,7 +364,7 @@ async def _emit_single_action( timeout=self.config.socketio_timeout, ) - case ActionType.FINISH: + case ActionType.FINISH | ActionType.FAIL: return await self.call( "finish", FinishEventData(**common).model_dump(), diff --git a/src/oagi/types/models/action.py b/src/oagi/types/models/action.py index 19b1981..d6fffd3 100644 --- a/src/oagi/types/models/action.py +++ b/src/oagi/types/models/action.py @@ -22,6 +22,7 @@ class ActionType(str, Enum): TYPE = "type" SCROLL = "scroll" FINISH = "finish" + FAIL = "fail" WAIT = "wait" CALL_USER = "call_user" diff --git a/src/oagi/utils/output_parser.py b/src/oagi/utils/output_parser.py index 5207625..86dddf5 100644 --- a/src/oagi/utils/output_parser.py +++ b/src/oagi/utils/output_parser.py @@ -45,7 +45,7 @@ def parse_raw_output(raw_output: str) -> Step: parsed_action = _parse_action(action_text.strip()) if parsed_action: actions.append(parsed_action) - if parsed_action.type == ActionType.FINISH: + if parsed_action.type in (ActionType.FINISH, ActionType.FAIL): stop = True return Step(reason=reason, actions=actions, stop=stop) @@ -105,6 +105,7 @@ def _parse_action(action_text: str) -> Action | None: - scroll(x, y, direction, c) # scroll at position - wait() # wait for a while - finish() # indicate task is finished + - fail() # indicate task is infeasible Args: action_text: String representation of a single action diff --git a/src/oagi/utils/prompt_builder.py b/src/oagi/utils/prompt_builder.py index 2e4c8f9..10cb92d 100644 --- a/src/oagi/utils/prompt_builder.py +++ b/src/oagi/utils/prompt_builder.py @@ -24,6 +24,7 @@ 8. scroll(x, y, direction, c) # scroll the mouse at position (x, y) in the direction of up or down for c times, where x and y are integers normalized between 0 and 1000 9. wait() # wait for a while 10. finish() # indicate the task is finished +11. fail() # indicate the task is infeasible Directly output the text beginning with <|think_start|>, no additional text is needed for this scenario. diff --git a/tests/conftest.py b/tests/conftest.py index 8464a45..0cf00cf 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -132,6 +132,12 @@ def sample_raw_output_completed(): return "<|think_start|>The task has been completed successfully<|think_end|>\n<|action_start|>finish()<|action_end|>" +@pytest.fixture +def sample_raw_output_failed(): + """Sample raw output for infeasible task.""" + return "<|think_start|>The task is infeasible<|think_end|>\n<|action_start|>fail()<|action_end|>" + + @pytest.fixture def sample_step(sample_action): """Sample Step object for testing.""" @@ -152,6 +158,16 @@ def completed_step(): ) +@pytest.fixture +def failed_step(): + """Sample failed Step object for infeasible task.""" + return Step( + reason="The task is infeasible", + actions=[Action(type=ActionType.FAIL, argument="", count=1)], + stop=True, + ) + + @pytest.fixture def mock_error_response(): """Mock error HTTP response.""" diff --git a/tests/test_actor.py b/tests/test_actor.py index 44306ad..e10603c 100644 --- a/tests/test_actor.py +++ b/tests/test_actor.py @@ -219,6 +219,27 @@ def test_step_with_completed_response( assert len(result.actions) == 1 assert result.actions[0].type == ActionType.FINISH + def test_step_with_failed_response( + self, actor, failed_step, sample_usage_obj, mock_upload_file_response + ): + actor.task_description = "Test task" + actor.task_id = "task-789" + + # Setup mocks + actor.client.put_s3_presigned_url.return_value = mock_upload_file_response + actor.client.chat_completion.return_value = ( + failed_step, + "<|think_start|>infeasible<|think_end|>\n<|action_start|>fail()<|action_end|>", + sample_usage_obj, + ) + + result = actor.step(b"image bytes") + + assert result.stop is True + assert result.reason == "The task is infeasible" + assert len(result.actions) == 1 + assert result.actions[0].type == ActionType.FAIL + def test_step_handles_exception(self, actor, mock_upload_file_response): actor.task_description = "Test task" actor.client.put_s3_presigned_url.return_value = mock_upload_file_response diff --git a/tests/test_oagi_action_converter.py b/tests/test_oagi_action_converter.py index e14fb6d..6acafca 100644 --- a/tests/test_oagi_action_converter.py +++ b/tests/test_oagi_action_converter.py @@ -97,6 +97,19 @@ def test_finish_action(self, converter): result = converter([action]) assert result[0] == "DONE" + def test_fail_action(self, converter): + action = Action(type=ActionType.FAIL, argument="", count=1) + result = converter([action]) + assert result[0] == "FAIL" + + def test_duplicate_terminal_actions_raises(self, converter): + actions = [ + Action(type=ActionType.FINISH, argument="", count=1), + Action(type=ActionType.FAIL, argument="", count=1), + ] + with pytest.raises(ValueError, match="Duplicate finish\\(\\)/fail\\(\\)"): + converter(actions) + class TestActionStringToStep: def test_pyautogui_command(self, converter): @@ -114,6 +127,11 @@ def test_done_command(self, converter): assert step["type"] == "sleep" assert step["parameters"]["seconds"] == 0 + def test_fail_command(self, converter): + step = converter.action_string_to_step("FAIL") + assert step["type"] == "sleep" + assert step["parameters"]["seconds"] == 0 + class TestMultipleActions: def test_action_count(self, converter): diff --git a/tests/test_pyautogui_action_handler.py b/tests/test_pyautogui_action_handler.py index bed2ec2..c8041f0 100644 --- a/tests/test_pyautogui_action_handler.py +++ b/tests/test_pyautogui_action_handler.py @@ -131,6 +131,11 @@ def test_finish_action(handler, mock_pyautogui): handler([action]) +def test_fail_action(handler, mock_pyautogui): + action = Action(type=ActionType.FAIL, argument="", count=1) + handler([action]) + + def test_call_user_action(handler, mock_pyautogui, capsys): action = Action(type=ActionType.CALL_USER, argument="", count=1) handler([action]) @@ -453,6 +458,20 @@ def test_finish_action_resets_handler(self, mock_pyautogui): handler([finish_action]) assert handler.caps_manager.caps_enabled is False + def test_fail_action_resets_handler(self, mock_pyautogui): + config = PyautoguiConfig(capslock_mode="session", post_batch_delay=0) + handler = PyautoguiActionHandler(config=config) + + # Enable caps lock + caps_action = Action(type=ActionType.HOTKEY, argument="capslock", count=1) + handler([caps_action]) + assert handler.caps_manager.caps_enabled is True + + # FAIL action should also reset handler + fail_action = Action(type=ActionType.FAIL, argument="", count=1) + handler([fail_action]) + assert handler.caps_manager.caps_enabled is False + class TestAsyncHandlerReset: def test_async_handler_reset_delegates_to_sync_handler(self, mock_pyautogui): diff --git a/tests/utils/test_output_parser.py b/tests/utils/test_output_parser.py index d6f7acd..965d4fe 100644 --- a/tests/utils/test_output_parser.py +++ b/tests/utils/test_output_parser.py @@ -34,6 +34,15 @@ def test_parse_finish_action_sets_stop(self): assert step.actions[0].type == ActionType.FINISH assert step.stop is True + def test_parse_fail_action_sets_stop(self): + raw = "<|think_start|>Task is infeasible<|think_end|>\n<|action_start|>fail()<|action_end|>" + step = parse_raw_output(raw) + + assert step.reason == "Task is infeasible" + assert len(step.actions) == 1 + assert step.actions[0].type == ActionType.FAIL + assert step.stop is True + def test_parse_multiple_actions_with_ampersand(self): raw = "<|think_start|>Do two things<|think_end|>\n<|action_start|>click(100, 200) & type(hello)<|action_end|>" step = parse_raw_output(raw) @@ -147,6 +156,7 @@ class TestParseAction: ("type(hello world)", ActionType.TYPE, "hello world"), ("wait()", ActionType.WAIT, ""), ("finish()", ActionType.FINISH, ""), + ("fail()", ActionType.FAIL, ""), ], ) def test_parse_basic_actions(self, action_text, expected_type, expected_arg): @@ -254,3 +264,18 @@ def test_finish_with_other_actions(self): assert step.stop is True assert len(step.actions) == 2 + + def test_fail_with_other_actions(self): + raw = "<|think_start|>Test<|think_end|>\n<|action_start|>click(100, 200) & fail()<|action_end|>" + step = parse_raw_output(raw) + + assert step.stop is True + assert len(step.actions) == 2 + assert step.actions[1].type == ActionType.FAIL + + def test_multiple_fail_actions_stop_true(self): + raw = "<|think_start|>Test<|think_end|>\n<|action_start|>fail() & fail()<|action_end|>" + step = parse_raw_output(raw) + + assert step.stop is True + assert len(step.actions) == 2