From fac8b2eaf952d91205631b0609ebb222892c533e Mon Sep 17 00:00:00 2001 From: aoshen524 Date: Fri, 30 Jan 2026 02:22:59 +0000 Subject: [PATCH 01/11] feat(converters): add multi-model action converters Add action converters for Claude, Qwen3, and Gemini models to enable remote execution of VLM actions via pyautogui command strings. Key changes: - Add oagi.converters module with BaseActionConverter base class - Add ClaudeActionConverter (XGA 1024x768 coordinate space) - Add Qwen3ActionConverter (0-999 coordinate space) - Add GeminiActionConverter (0-1000 coordinate space) - Add OagiActionConverter (0-1000 coordinate space) - Extract shared utilities to oagi.handler.utils: - CoordinateScaler class for coordinate transformation - KEY_MAP and PYAUTOGUI_VALID_KEYS constants - normalize_key(), parse_hotkey(), validate_keys() functions - Refactor PyautoguiActionHandler to use shared utilities The converters generate pyautogui command strings that can be: 1. Executed locally via PyautoguiActionHandler 2. 
Sent to remote sandbox via runtime API (action_string_to_step) Co-Authored-By: Claude Opus 4.5 --- src/oagi/__init__.py | 26 + src/oagi/converters/__init__.py | 57 ++ src/oagi/converters/base.py | 295 ++++++++++ src/oagi/converters/claude.py | 193 ++++++ src/oagi/converters/gemini.py | 217 +++++++ src/oagi/converters/models.py | 103 ++++ src/oagi/converters/oagi.py | 197 +++++++ src/oagi/converters/qwen3.py | 194 ++++++ src/oagi/handler/pyautogui_action_handler.py | 59 +- src/oagi/handler/utils.py | 588 +++++++++++++++++++ 10 files changed, 1890 insertions(+), 39 deletions(-) create mode 100644 src/oagi/converters/__init__.py create mode 100644 src/oagi/converters/base.py create mode 100644 src/oagi/converters/claude.py create mode 100644 src/oagi/converters/gemini.py create mode 100644 src/oagi/converters/models.py create mode 100644 src/oagi/converters/oagi.py create mode 100644 src/oagi/converters/qwen3.py diff --git a/src/oagi/__init__.py b/src/oagi/__init__.py index e864bda..07bf68c 100644 --- a/src/oagi/__init__.py +++ b/src/oagi/__init__.py @@ -38,6 +38,16 @@ # Format: name -> (module_path, package_to_check, extra_name) # package_to_check is None if no optional dependency is required _LAZY_IMPORTS_DATA: dict[str, tuple[str, str | None, str | None]] = { + # Action converters (no optional dependencies) + "OagiActionConverter": ("oagi.converters.oagi", None, None), + "ClaudeActionConverter": ("oagi.converters.claude", None, None), + "Qwen3ActionConverter": ("oagi.converters.qwen3", None, None), + "GeminiActionConverter": ("oagi.converters.gemini", None, None), + "ClaudeAction": ("oagi.converters.models", None, None), + "Qwen3Action": ("oagi.converters.models", None, None), + "GeminiAction": ("oagi.converters.models", None, None), + "ConverterConfig": ("oagi.converters.base", None, None), + "BaseActionConverter": ("oagi.converters.base", None, None), # Desktop handlers (require pyautogui/PIL) "AsyncPyautoguiActionHandler": ( 
"oagi.handler.async_pyautogui_action_handler", @@ -88,6 +98,12 @@ from oagi.agent.default import AsyncDefaultAgent from oagi.agent.observer.agent_observer import AsyncAgentObserver from oagi.agent.tasker import TaskerAgent + from oagi.converters.base import BaseActionConverter, ConverterConfig + from oagi.converters.claude import ClaudeActionConverter + from oagi.converters.gemini import GeminiActionConverter + from oagi.converters.models import ClaudeAction, GeminiAction, Qwen3Action + from oagi.converters.oagi import OagiActionConverter + from oagi.converters.qwen3 import Qwen3ActionConverter from oagi.handler.async_pyautogui_action_handler import AsyncPyautoguiActionHandler from oagi.handler.async_screenshot_maker import AsyncScreenshotMaker from oagi.handler.async_ydotool_action_handler import AsyncYdotoolActionHandler @@ -174,4 +190,14 @@ def __dir__() -> list[str]: "YdotoolConfig", # Lazy imports - Screen manager "ScreenManager", + # Lazy imports - Action converters + "OagiActionConverter", + "ClaudeActionConverter", + "Qwen3ActionConverter", + "GeminiActionConverter", + "ClaudeAction", + "Qwen3Action", + "GeminiAction", + "ConverterConfig", + "BaseActionConverter", ] diff --git a/src/oagi/converters/__init__.py b/src/oagi/converters/__init__.py new file mode 100644 index 0000000..3999591 --- /dev/null +++ b/src/oagi/converters/__init__.py @@ -0,0 +1,57 @@ +# ----------------------------------------------------------------------------- +# Copyright (c) OpenAGI Foundation +# All rights reserved. +# +# This file is part of the official API project. +# Licensed under the MIT License. +# ----------------------------------------------------------------------------- +"""Action converters for multi-model VLM support. 
+ +This module provides converters for different VLM models: +- OAGI: Native OAGI actions (0-1000 normalized coordinates) +- Claude: Claude CUA actions (XGA 1024x768 coordinates) +- Qwen3: Qwen3-VL actions (0-999 normalized coordinates) +- Gemini: Gemini CUA actions (0-1000 normalized coordinates) + +All converters output pyautogui command strings that can be: +1. Executed locally via PyautoguiActionHandler +2. Sent to remote sandbox via runtime API (using action_string_to_step()) + +Example usage: + from oagi.converters import ClaudeActionConverter, ClaudeAction, ConverterConfig + + # Configure for 1920x1080 sandbox + config = ConverterConfig(sandbox_width=1920, sandbox_height=1080) + converter = ClaudeActionConverter(config=config) + + # Convert Claude actions to pyautogui strings + actions = [ClaudeAction(action_type="left_click", coordinate=(512, 384))] + pyautogui_commands = converter(actions) + + # Convert to runtime API steps + for cmd, is_last in pyautogui_commands: + step = converter.action_string_to_step(cmd) + # Execute step via runtime API... 
+""" + +from oagi.converters.base import BaseActionConverter, ConverterConfig +from oagi.converters.claude import ClaudeActionConverter +from oagi.converters.gemini import GeminiActionConverter +from oagi.converters.models import ClaudeAction, GeminiAction, Qwen3Action +from oagi.converters.oagi import OagiActionConverter +from oagi.converters.qwen3 import Qwen3ActionConverter + +__all__ = [ + # Base + "BaseActionConverter", + "ConverterConfig", + # Converters + "OagiActionConverter", + "ClaudeActionConverter", + "Qwen3ActionConverter", + "GeminiActionConverter", + # Action models + "ClaudeAction", + "Qwen3Action", + "GeminiAction", +] diff --git a/src/oagi/converters/base.py b/src/oagi/converters/base.py new file mode 100644 index 0000000..fdf6ca2 --- /dev/null +++ b/src/oagi/converters/base.py @@ -0,0 +1,295 @@ +# ----------------------------------------------------------------------------- +# Copyright (c) OpenAGI Foundation +# All rights reserved. +# +# This file is part of the official API project. +# Licensed under the MIT License. +# ----------------------------------------------------------------------------- +"""Base class for action converters. + +This module provides the abstract base class for converting model-specific +actions to pyautogui command strings for remote execution. +""" + +import re +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import Any, Generic, TypeVar + +from oagi.handler.capslock_manager import CapsLockManager +from oagi.handler.utils import ( + CoordinateScaler, + normalize_key, + parse_hotkey, + validate_keys, +) + +T = TypeVar("T") + + +@dataclass +class ConverterConfig: + """Configuration for action converters. + + Matches the configuration options in PyautoguiConfig for consistency. 
+ """ + + sandbox_width: int = 1920 + sandbox_height: int = 1080 + drag_duration: float = 0.5 + scroll_amount: int = 2 + wait_duration: float = 1.0 + hotkey_interval: float = 0.1 + capslock_mode: str = "session" + + +class BaseActionConverter(ABC, Generic[T]): + """Abstract base class for action converters. + + Subclasses must implement: + - coord_width/coord_height properties for input coordinate space + - _convert_single_action() for model-specific conversion logic + - serialize_actions() for trajectory logging + + Provides common functionality: + - Coordinate scaling via CoordinateScaler + - Key normalization via shared utils + - __call__ interface returning [(action_string, is_last), ...] + - action_string_to_step() for runtime API format + """ + + def __init__( + self, + *, + config: ConverterConfig | None = None, + logger: Any | None = None, + ): + """Initialize the converter. + + Args: + config: Converter configuration. Uses defaults if not provided. + logger: Optional logger instance for debug/error logging. + """ + self.config = config or ConverterConfig() + self.logger = logger + + # Initialize coordinate scaler + self._coord_scaler = CoordinateScaler( + source_width=self.coord_width, + source_height=self.coord_height, + target_width=self.config.sandbox_width, + target_height=self.config.sandbox_height, + ) + + # Initialize caps lock manager + self.caps_manager = CapsLockManager(mode=self.config.capslock_mode) + + # Track last cursor position (for actions without explicit coordinates) + self._last_x: int | None = None + self._last_y: int | None = None + + @property + @abstractmethod + def coord_width(self) -> int: + """Input coordinate space width (e.g., 1024 for XGA, 1000 for OAGI).""" + ... + + @property + @abstractmethod + def coord_height(self) -> int: + """Input coordinate space height (e.g., 768 for XGA, 1000 for OAGI).""" + ... 
+ + @property + def scale_x(self) -> float: + """X scaling factor from input to sandbox coordinates.""" + return self._coord_scaler.scale_x + + @property + def scale_y(self) -> float: + """Y scaling factor from input to sandbox coordinates.""" + return self._coord_scaler.scale_y + + def scale_coordinate(self, x: int | float, y: int | float) -> tuple[int, int]: + """Scale coordinates from model space to sandbox space. + + Args: + x: X coordinate in model space + y: Y coordinate in model space + + Returns: + Tuple of (scaled_x, scaled_y) in sandbox space + """ + return self._coord_scaler.scale(x, y) + + def normalize_key(self, key: str) -> str: + """Normalize a key name to pyautogui format. + + Args: + key: Key name to normalize + + Returns: + Normalized key name + """ + return normalize_key(key) + + def parse_hotkey(self, hotkey_str: str, *, validate: bool = True) -> list[str]: + """Parse a hotkey string into a list of normalized key names. + + Args: + hotkey_str: Hotkey string (e.g., "ctrl+c") + validate: If True, validate keys against PYAUTOGUI_VALID_KEYS + + Returns: + List of normalized key names + """ + return parse_hotkey(hotkey_str, validate=validate) + + def validate_keys(self, keys: list[str]) -> None: + """Validate that all keys are recognized by pyautogui. + + Args: + keys: List of key names to validate + + Raises: + ValueError: If any key is invalid + """ + validate_keys(keys) + + def _get_last_or_center(self) -> tuple[int, int]: + """Get last cursor position or screen center as fallback. 
+ + Returns: + Tuple of (x, y) coordinates + """ + if self._last_x is not None and self._last_y is not None: + return self._last_x, self._last_y + return self.config.sandbox_width // 2, self.config.sandbox_height // 2 + + def _log_error(self, message: str) -> None: + """Log an error message if logger is available.""" + if self.logger: + self.logger.error(message) + + def _log_info(self, message: str) -> None: + """Log an info message if logger is available.""" + if self.logger: + self.logger.info(message) + + def _log_debug(self, message: str) -> None: + """Log a debug message if logger is available.""" + if self.logger: + self.logger.debug(message) + + def __call__(self, actions: list[T]) -> list[tuple[str, bool]]: + """Convert actions to list of (action_string, is_last_of_repeat) tuples. + + Args: + actions: List of model-specific action objects + + Returns: + List of tuples: [(action_string, is_last), ...] + - action_string: pyautogui command string + - is_last: True if this is the last action in the batch + + Raises: + RuntimeError: If all action conversions failed + """ + converted: list[tuple[str, bool]] = [] + failed: list[tuple[str, str]] = [] + skipped: list[str] = [] + + if not actions: + return converted + + for i, action in enumerate(actions): + is_last_action = i == len(actions) - 1 + + try: + action_strings = self._convert_single_action(action) + + if not action_strings: + # No-op action (e.g., screenshot, cursor_position) + action_type = getattr(action, "action_type", repr(action)) + skipped.append(str(action_type)) + continue + + for j, action_str in enumerate(action_strings): + is_last = is_last_action and (j == len(action_strings) - 1) + converted.append((action_str, is_last)) + + except Exception as e: + action_repr = repr(action) + self._log_error(f"Failed to convert action: {action_repr}, error: {e}") + failed.append((action_repr, str(e))) + + if skipped: + self._log_debug(f"Skipped no-op actions: {skipped}") + + if not converted and actions 
and failed: + raise RuntimeError( + f"All action conversions failed ({len(failed)}/{len(actions)}): {failed}" + ) + + return converted + + @abstractmethod + def _convert_single_action(self, action: T) -> list[str]: + """Convert a single action to pyautogui command string(s). + + Args: + action: Model-specific action object + + Returns: + List of pyautogui command strings (may be empty for no-op actions) + + Raises: + ValueError: If action format is invalid + """ + ... + + @abstractmethod + def serialize_actions(self, actions: list[T]) -> list[dict[str, Any]]: + """Serialize actions for trajectory logging. + + Args: + actions: List of model-specific action objects + + Returns: + List of serialized action dictionaries + """ + ... + + def action_string_to_step(self, action: str) -> dict[str, Any]: + """Convert an action string into a step for runtime/do API. + + Args: + action: Action string (e.g., "pyautogui.click(x=100, y=200)") + + Returns: + Step dict for runtime API + """ + action_str = str(action).strip() + + # Special markers + upper = action_str.upper() + if upper in ["DONE", "FAIL"]: + return {"type": "sleep", "parameters": {"seconds": 0}} + + # WAIT(seconds) + wait_match = re.match( + r"^WAIT\((?P<sec>[0-9]*\.?[0-9]+)\)$", action_str, re.IGNORECASE + ) + if wait_match: + seconds = float(wait_match.group("sec")) + return {"type": "sleep", "parameters": {"seconds": seconds}} + + # pyautogui code path + if "pyautogui" in action_str.lower(): + return { + "type": "pyautogui", + "parameters": {"code": action_str}, + } + + # Default: shell command + return {"type": "execute", "parameters": {"command": action_str, "shell": True}} diff --git a/src/oagi/converters/claude.py b/src/oagi/converters/claude.py new file mode 100644 index 0000000..bad3259 --- /dev/null +++ b/src/oagi/converters/claude.py @@ -0,0 +1,193 @@ +# ----------------------------------------------------------------------------- +# Copyright (c) OpenAGI Foundation +# All rights reserved. 
+# +# This file is part of the official API project. +# Licensed under the MIT License. +# ----------------------------------------------------------------------------- +"""Claude action converter. + +This module provides the ClaudeActionConverter for converting Claude CUA +(Computer Use Agent) actions to pyautogui command strings. +""" + +from typing import Any + +from .base import BaseActionConverter +from .models import ClaudeAction + +# Claude uses XGA resolution (1024x768) for coordinate space +XGA_WIDTH = 1024 +XGA_HEIGHT = 768 + + +class ClaudeActionConverter(BaseActionConverter[ClaudeAction]): + """Convert Claude CUA actions to pyautogui command strings. + + This converter handles: + 1. Coordinate scaling from XGA (1024x768) to sandbox dimensions (1920x1080) + 2. Action format conversion from Claude format to pyautogui strings + 3. Key name normalization for hotkey combinations + + The output can be converted to runtime API steps via action_string_to_step(). + """ + + @property + def coord_width(self) -> int: + return XGA_WIDTH + + @property + def coord_height(self) -> int: + return XGA_HEIGHT + + def _parse_claude_hotkey(self, text: str) -> list[str]: + """Parse Claude hotkey string into list of normalized keys. + + Claude uses "-" or "+" as separators. 
+ """ + text = text.replace("-", "+") + keys = [self.normalize_key(k) for k in text.split("+") if k.strip()] + return keys + + def _get_coords_or_last(self, action: ClaudeAction) -> tuple[int, int]: + """Get scaled coordinates from action or fall back to last position.""" + if action.coordinate is not None: + x, y = self.scale_coordinate(*action.coordinate) + self._last_x, self._last_y = x, y + return x, y + elif self._last_x is not None and self._last_y is not None: + return self._last_x, self._last_y + else: + return self._get_last_or_center() + + def _convert_single_action(self, action: ClaudeAction) -> list[str]: + """Convert a single Claude action to pyautogui command string(s).""" + action_type = action.action_type.lower() + + if action_type == "screenshot": + return [] # No-op + + if action_type == "mouse_move": + if action.coordinate is None: + raise ValueError("coordinate is required for mouse_move") + x, y = self.scale_coordinate(*action.coordinate) + self._last_x, self._last_y = x, y + return [f"pyautogui.moveTo({x}, {y})"] + + if action_type == "left_click": + x, y = self._get_coords_or_last(action) + return [f"pyautogui.click(x={x}, y={y})"] + + if action_type == "double_click": + x, y = self._get_coords_or_last(action) + return [f"pyautogui.doubleClick(x={x}, y={y})"] + + if action_type == "triple_click": + x, y = self._get_coords_or_last(action) + return [f"pyautogui.tripleClick(x={x}, y={y})"] + + if action_type == "right_click": + x, y = self._get_coords_or_last(action) + return [f"pyautogui.rightClick(x={x}, y={y})"] + + if action_type == "middle_click": + x, y = self._get_coords_or_last(action) + return [f"pyautogui.click(x={x}, y={y}, button='middle')"] + + if action_type == "left_click_drag": + # Start from start_coordinate or last position + if action.start_coordinate is not None: + sx, sy = self.scale_coordinate(*action.start_coordinate) + elif self._last_x is not None and self._last_y is not None: + sx, sy = self._last_x, self._last_y + 
else: + sx, sy = self._get_last_or_center() + + # End at coordinate + if action.coordinate is None: + raise ValueError( + "coordinate (end position) is required for left_click_drag" + ) + ex, ey = self.scale_coordinate(*action.coordinate) + self._last_x, self._last_y = ex, ey + + return [ + f"pyautogui.moveTo({sx}, {sy})", + f"pyautogui.dragTo({ex}, {ey}, duration={self.config.drag_duration})", + ] + + if action_type == "type": + if action.text is None: + raise ValueError("text is required for type action") + text = action.text.replace("\\", "\\\\").replace("'", "\\'") + return [f"pyautogui.typewrite('{text}')"] + + if action_type == "key": + if action.text is None: + raise ValueError("text is required for key action") + keys = self._parse_claude_hotkey(action.text) + if not keys: + raise ValueError(f"Invalid key combination: {action.text}") + keys_str = ", ".join(repr(k) for k in keys) + return [ + f"pyautogui.hotkey({keys_str}, interval={self.config.hotkey_interval})" + ] + + if action_type == "scroll": + if action.coordinate is None: + raise ValueError("coordinate is required for scroll action") + x, y = self.scale_coordinate(*action.coordinate) + + direction = (action.scroll_direction or "down").strip().lower() + amount = ( + action.scroll_amount + if action.scroll_amount is not None + else self.config.scroll_amount + ) + + if direction == "up": + scroll_val = amount + elif direction == "down": + scroll_val = -amount + else: + raise ValueError(f"Invalid scroll direction: {direction}") + + return [ + f"pyautogui.moveTo({x}, {y})", + f"pyautogui.scroll({scroll_val})", + ] + + if action_type == "wait": + duration = ( + action.duration + if action.duration is not None + else self.config.wait_duration + ) + return [f"WAIT({duration})"] + + if action_type == "cursor_position": + return [] # No-op + + self._log_debug(f"Unknown Claude action type: {action_type}") + return [] + + def serialize_actions(self, actions: list[ClaudeAction]) -> list[dict[str, Any]]: + 
"""Serialize Claude actions for trajectory logging.""" + serialized = [] + for action in actions or []: + serialized.append( + { + "type": action.action_type, + "coordinate": list(action.coordinate) + if action.coordinate + else None, + "start_coordinate": list(action.start_coordinate) + if action.start_coordinate + else None, + "text": action.text, + "scroll_direction": action.scroll_direction, + "scroll_amount": action.scroll_amount, + "duration": action.duration, + } + ) + return serialized diff --git a/src/oagi/converters/gemini.py b/src/oagi/converters/gemini.py new file mode 100644 index 0000000..0386b04 --- /dev/null +++ b/src/oagi/converters/gemini.py @@ -0,0 +1,217 @@ +# ----------------------------------------------------------------------------- +# Copyright (c) OpenAGI Foundation +# All rights reserved. +# +# This file is part of the official API project. +# Licensed under the MIT License. +# ----------------------------------------------------------------------------- +"""Gemini action converter. + +This module provides the GeminiActionConverter for converting Gemini CUA +actions to pyautogui command strings. +""" + +from typing import Any + +from .base import BaseActionConverter +from .models import GeminiAction + +# Gemini uses 0-1000 coordinate space +GEMINI_COORD_SIZE = 1000 + + +class GeminiActionConverter(BaseActionConverter[GeminiAction]): + """Convert Gemini CUA actions to pyautogui command strings. + + This converter handles: + 1. Coordinate scaling from 0-1000 space to sandbox dimensions (1920x1080) + 2. Action format conversion from Gemini format to pyautogui strings + 3. High-level actions like navigate, search, go_back + 4. Key name normalization for hotkey combinations + + The output can be converted to runtime API steps via action_string_to_step(). 
+ """ + + @property + def coord_width(self) -> int: + return GEMINI_COORD_SIZE + + @property + def coord_height(self) -> int: + return GEMINI_COORD_SIZE + + def _parse_gemini_hotkey(self, keys_str: str) -> list[str]: + """Parse Gemini hotkey string into list of normalized keys.""" + keys_str = keys_str.replace("-", "+") + keys = [self.normalize_key(k) for k in keys_str.split("+") if k.strip()] + return keys + + def _convert_single_action(self, action: GeminiAction) -> list[str]: + """Convert a single Gemini action to pyautogui command string(s).""" + action_type = action.action_type.lower() + hotkey_interval = self.config.hotkey_interval + + if action_type == "open_web_browser": + return [] # No-op + + if action_type == "click_at": + if action.x is None or action.y is None: + raise ValueError("x and y are required for click_at") + x, y = self.scale_coordinate(action.x, action.y) + self._last_x, self._last_y = x, y + return [f"pyautogui.click(x={x}, y={y})"] + + if action_type == "hover_at": + if action.x is None or action.y is None: + raise ValueError("x and y are required for hover_at") + x, y = self.scale_coordinate(action.x, action.y) + self._last_x, self._last_y = x, y + return [f"pyautogui.moveTo({x}, {y})"] + + if action_type == "type_text_at": + if action.x is None or action.y is None: + raise ValueError("x and y are required for type_text_at") + if action.text is None: + raise ValueError("text is required for type_text_at") + + x, y = self.scale_coordinate(action.x, action.y) + self._last_x, self._last_y = x, y + + commands = [f"pyautogui.click(x={x}, y={y})"] + + if action.clear_before_typing: + commands.append( + f"pyautogui.hotkey('ctrl', 'a', interval={hotkey_interval})" + ) + commands.append("pyautogui.press('delete')") + + text = action.text.replace("\\", "\\\\").replace("'", "\\'") + commands.append(f"pyautogui.typewrite('{text}')") + + if action.press_enter: + commands.append("pyautogui.press('enter')") + + return commands + + if action_type == 
"scroll_document": + direction = (action.direction or "down").strip().lower() + + if direction == "down": + return ["pyautogui.press('pagedown')"] + elif direction == "up": + return ["pyautogui.press('pageup')"] + elif direction == "left": + return ["pyautogui.press('left')"] + elif direction == "right": + return ["pyautogui.press('right')"] + else: + raise ValueError(f"Invalid scroll direction: {direction}") + + if action_type == "scroll_at": + if action.x is None or action.y is None: + raise ValueError("x and y are required for scroll_at") + + x, y = self.scale_coordinate(action.x, action.y) + direction = (action.direction or "down").strip().lower() + + amount = self.config.scroll_amount + if action.magnitude is not None: + amount = max(1, action.magnitude // 100) + + if direction == "up": + scroll_val = amount + elif direction == "down": + scroll_val = -amount + else: + self._log_debug( + f"Unsupported scroll direction '{direction}', defaulting to down" + ) + scroll_val = -amount + + return [ + f"pyautogui.moveTo({x}, {y})", + f"pyautogui.scroll({scroll_val})", + ] + + if action_type == "wait_5_seconds": + return ["WAIT(5)"] + + if action_type == "go_back": + return [f"pyautogui.hotkey('alt', 'left', interval={hotkey_interval})"] + + if action_type == "go_forward": + return [f"pyautogui.hotkey('alt', 'right', interval={hotkey_interval})"] + + if action_type == "search": + return [ + f"pyautogui.hotkey('ctrl', 'l', interval={hotkey_interval})", + "pyautogui.typewrite('https://www.google.com')", + "pyautogui.press('enter')", + ] + + if action_type == "navigate": + if action.url is None: + raise ValueError("url is required for navigate action") + url = action.url + if not url.startswith(("http://", "https://")): + url = "https://" + url + url = url.replace("'", "\\'") + return [ + f"pyautogui.hotkey('ctrl', 'l', interval={hotkey_interval})", + f"pyautogui.hotkey('ctrl', 'a', interval={hotkey_interval})", + f"pyautogui.typewrite('{url}')", + 
"pyautogui.press('enter')", + ] + + if action_type == "key_combination": + if action.keys is None: + raise ValueError("keys is required for key_combination action") + keys = self._parse_gemini_hotkey(action.keys) + if not keys: + raise ValueError(f"Invalid key combination: {action.keys}") + keys_str = ", ".join(repr(k) for k in keys) + return [f"pyautogui.hotkey({keys_str}, interval={hotkey_interval})"] + + if action_type == "drag_and_drop": + if action.x is None or action.y is None: + raise ValueError( + "x and y (start position) are required for drag_and_drop" + ) + if action.destination_x is None or action.destination_y is None: + raise ValueError( + "destination_x and destination_y are required for drag_and_drop" + ) + + sx, sy = self.scale_coordinate(action.x, action.y) + ex, ey = self.scale_coordinate(action.destination_x, action.destination_y) + self._last_x, self._last_y = ex, ey + + return [ + f"pyautogui.moveTo({sx}, {sy})", + f"pyautogui.dragTo({ex}, {ey}, duration={self.config.drag_duration})", + ] + + self._log_debug(f"Unknown Gemini action type: {action_type}") + return [] + + def serialize_actions(self, actions: list[GeminiAction]) -> list[dict[str, Any]]: + """Serialize Gemini actions for trajectory logging.""" + serialized = [] + for action in actions or []: + serialized.append( + { + "type": action.action_type, + "x": action.x, + "y": action.y, + "text": action.text, + "press_enter": action.press_enter, + "clear_before_typing": action.clear_before_typing, + "direction": action.direction, + "magnitude": action.magnitude, + "destination_x": action.destination_x, + "destination_y": action.destination_y, + "keys": action.keys, + "url": action.url, + } + ) + return serialized diff --git a/src/oagi/converters/models.py b/src/oagi/converters/models.py new file mode 100644 index 0000000..b06cb73 --- /dev/null +++ b/src/oagi/converters/models.py @@ -0,0 +1,103 @@ +# ----------------------------------------------------------------------------- +# Copyright 
(c) OpenAGI Foundation +# All rights reserved. +# +# This file is part of the official API project. +# Licensed under the MIT License. +# ----------------------------------------------------------------------------- +"""Action dataclasses for model-specific action converters. + +This module provides dataclass definitions for actions from different VLM models: +- ClaudeAction: Claude CUA actions (XGA 1024x768 coordinates) +- Qwen3Action: Qwen3-VL actions (0-999 normalized coordinates) +- GeminiAction: Gemini actions (0-1000 normalized coordinates) + +Note: OAGI actions use the existing oagi.types.Action model. +""" + +from dataclasses import dataclass + + +@dataclass +class ClaudeAction: + """Represents a Claude computer use action. + + Claude uses XGA coordinates (1024x768) for coordinate actions. + + Attributes: + action_type: The type of action (e.g., "left_click", "type", "key") + coordinate: XGA coordinates (x, y) where x in [0,1024] and y in [0,768] + text: Text content for type or key actions + scroll_direction: Direction for scroll ("up" or "down") + scroll_amount: Amount to scroll (optional, uses default if not specified) + duration: Duration in seconds for wait actions + start_coordinate: Starting coordinate for drag operations + """ + + action_type: str + coordinate: tuple[int, int] | None = None + text: str | None = None + scroll_direction: str | None = None + scroll_amount: int | None = None + duration: int | None = None + start_coordinate: tuple[int, int] | None = None + + +@dataclass +class Qwen3Action: + """Represents a Qwen3 computer use action. + + Qwen3 uses normalized coordinates (0-999) for coordinate actions. 
+ + Attributes: + action_type: The type of action (e.g., "left_click", "type", "key") + coordinate: Normalized coordinates (x, y) where both x and y in [0,999] + text: Text content for type and answer actions + keys: List of key names for key/hotkey actions + pixels: Pixel amount for scroll actions + time: Duration in seconds for wait actions + status: Status string for terminate actions ("success" or "failure") + """ + + action_type: str + coordinate: tuple[int, int] | None = None + text: str | None = None + keys: list[str] | None = None + pixels: int | None = None + time: float | None = None + status: str | None = None + + +@dataclass +class GeminiAction: + """Represents a Gemini computer use action. + + Gemini uses normalized coordinates (0-1000) for coordinate actions. + + Attributes: + action_type: The type of action (e.g., "click_at", "type_text_at", "scroll_at") + x: X coordinate (0-1000) + y: Y coordinate (0-1000) + text: Text content for typing actions + press_enter: Whether to press Enter after typing + clear_before_typing: Whether to clear existing text before typing + direction: Scroll direction ("up", "down", "left", "right") + magnitude: Scroll magnitude in pixels + destination_x: Destination X coordinate for drag operations + destination_y: Destination Y coordinate for drag operations + keys: Key combination string for key actions (e.g., "ctrl+c") + url: URL for navigation actions + """ + + action_type: str + x: int | None = None + y: int | None = None + text: str | None = None + press_enter: bool | None = None + clear_before_typing: bool | None = None + direction: str | None = None + magnitude: int | None = None + destination_x: int | None = None + destination_y: int | None = None + keys: str | None = None + url: str | None = None diff --git a/src/oagi/converters/oagi.py b/src/oagi/converters/oagi.py new file mode 100644 index 0000000..f081fe8 --- /dev/null +++ b/src/oagi/converters/oagi.py @@ -0,0 +1,197 @@ +# 
----------------------------------------------------------------------------- +# Copyright (c) OpenAGI Foundation +# All rights reserved. +# +# This file is part of the official API project. +# Licensed under the MIT License. +# ----------------------------------------------------------------------------- +"""OAGI action converter. + +This module provides the OagiActionConverter for converting OAGI actions +to pyautogui command strings for remote execution. +""" + +from typing import Any + +from oagi.handler.utils import ( + parse_click_coords, + parse_drag_coords, + parse_scroll_coords, +) +from oagi.types import Action, ActionType + +from .base import BaseActionConverter + +# OAGI uses normalized 0-1000 coordinate space +OAGI_COORD_SIZE = 1000 + + +class OagiActionConverter(BaseActionConverter[Action]): + """Convert OAGI actions to pyautogui command strings. + + This converter handles: + 1. Coordinate scaling from 0-1000 space to sandbox dimensions (1920x1080) + 2. Action format conversion from OAGI Action format to pyautogui strings + 3. Key name normalization for hotkey combinations + + The output can be converted to runtime API steps via action_string_to_step(). + """ + + @property + def coord_width(self) -> int: + return OAGI_COORD_SIZE + + @property + def coord_height(self) -> int: + return OAGI_COORD_SIZE + + def __call__(self, actions: list[Action]) -> list[tuple[str, bool]]: + """Convert OAGI actions to list of (action_string, is_last) tuples. + + Extends base implementation to handle action count and finish detection. + """ + converted: list[tuple[str, bool]] = [] + failed: list[tuple[str, str]] = [] + has_finish = False + + if not actions: + return converted + + for action in actions: + # Check for duplicate finish() during iteration + is_finish = action.type == ActionType.FINISH + if is_finish: + if has_finish: + raise ValueError( + "Duplicate finish() detected. " + "Only one finish() is allowed per action sequence." 
+ ) + has_finish = True + + try: + converted.extend(self._convert_action(action)) + except Exception as e: + action_repr = f"{action.type.value}({action.argument})" + self._log_error(f"Failed to convert action: {action_repr}, error: {e}") + failed.append((action_repr, str(e))) + + if not converted and actions and failed: + raise RuntimeError( + f"All action conversions failed ({len(failed)}/{len(actions)}): {failed}" + ) + return converted + + def _convert_action(self, action: Action) -> list[tuple[str, bool]]: + """Convert action to list of (action_string, is_last_of_repeat) tuples. + + Handles action.count for repeat support. + """ + count = action.count or 1 + out: list[tuple[str, bool]] = [] + single_actions = self._convert_single_action(action) + + # Repeat the actions count times + for i in range(int(count)): + is_last_repeat = i == int(count) - 1 + for j, action_str in enumerate(single_actions): + is_last = is_last_repeat and (j == len(single_actions) - 1) + out.append((action_str, is_last)) + + return out + + def _convert_single_action(self, action: Action) -> list[str]: + """Convert a single OAGI action to pyautogui command string(s).""" + action_type = action.type.value + argument = (action.argument or "").strip("()") + + drag_duration = self.config.drag_duration + scroll_amount = self.config.scroll_amount + wait_duration = self.config.wait_duration + hotkey_interval = self.config.hotkey_interval + + if action_type == ActionType.CLICK.value: + x, y = parse_click_coords(argument, self._coord_scaler) + return [f"pyautogui.click(x={x}, y={y})"] + + if action_type == ActionType.LEFT_DOUBLE.value: + x, y = parse_click_coords(argument, self._coord_scaler) + return [f"pyautogui.doubleClick(x={x}, y={y})"] + + if action_type == ActionType.LEFT_TRIPLE.value: + x, y = parse_click_coords(argument, self._coord_scaler) + return [f"pyautogui.tripleClick(x={x}, y={y})"] + + if action_type == ActionType.RIGHT_SINGLE.value: + x, y = parse_click_coords(argument, 
self._coord_scaler) + return [f"pyautogui.rightClick(x={x}, y={y})"] + + if action_type == ActionType.DRAG.value: + sx, sy, ex, ey = parse_drag_coords(argument, self._coord_scaler) + return [ + f"pyautogui.moveTo({sx}, {sy})", + f"pyautogui.dragTo({ex}, {ey}, duration={drag_duration})", + ] + + if action_type == ActionType.HOTKEY.value: + keys = self.parse_hotkey(argument, validate=True) + valid_keys = [k for k in keys if k] + if not valid_keys: + raise ValueError( + f"Invalid hotkey format: '{argument}'. " + "Expected key names like 'ctrl+c', 'alt+tab'" + ) + # Check if this is a caps lock key press + if len(valid_keys) == 1 and valid_keys[0] == "capslock": + if self.caps_manager.should_use_system_capslock(): + return [f"pyautogui.hotkey('capslock', interval={hotkey_interval})"] + else: + self.caps_manager.toggle() + return [] # No pyautogui command for session mode + else: + keys_str = ", ".join(repr(k) for k in valid_keys) + return [f"pyautogui.hotkey({keys_str}, interval={hotkey_interval})"] + + if action_type == ActionType.TYPE.value: + text = argument.strip("\"'") + text = self.caps_manager.transform_text(text) + return [f"pyautogui.typewrite({text!r})"] + + if action_type == ActionType.SCROLL.value: + x, y, direction = parse_scroll_coords(argument, self._coord_scaler) + amount = scroll_amount if direction == "up" else -scroll_amount + return [f"pyautogui.moveTo({x}, {y})", f"pyautogui.scroll({amount})"] + + if action_type == ActionType.WAIT.value: + try: + seconds = float(argument) if argument else wait_duration + except ValueError: + raise ValueError( + f"Invalid wait duration: '{argument}'. " + "Expected numeric value in seconds." + ) + return [f"WAIT({seconds})"] + + if action_type == ActionType.FINISH.value: + self._log_info("Task completion action -> DONE") + return ["DONE"] + + if action_type == ActionType.CALL_USER.value: + self._log_info("User intervention requested") + return [] + + raise ValueError( + f"Unknown action type: '{action_type}'. 
" + "Supported: click, left_double, left_triple, right_single, drag, " + "hotkey, type, scroll, wait, finish, call_user" + ) + + def serialize_actions(self, actions: list[Action]) -> list[dict[str, Any]]: + """Serialize OAGI actions for trajectory logging.""" + return [ + { + "type": action.type.value, + "argument": action.argument, + "count": action.count, + } + for action in (actions or []) + ] diff --git a/src/oagi/converters/qwen3.py b/src/oagi/converters/qwen3.py new file mode 100644 index 0000000..94a3b19 --- /dev/null +++ b/src/oagi/converters/qwen3.py @@ -0,0 +1,194 @@ +# ----------------------------------------------------------------------------- +# Copyright (c) OpenAGI Foundation +# All rights reserved. +# +# This file is part of the official API project. +# Licensed under the MIT License. +# ----------------------------------------------------------------------------- +"""Qwen3 action converter. + +This module provides the Qwen3ActionConverter for converting Qwen3-VL +actions to pyautogui command strings. +""" + +from typing import Any + +from .base import BaseActionConverter, ConverterConfig +from .models import Qwen3Action + +# Qwen3 uses normalized 0-999 coordinate space +QWEN3_COORD_SIZE = 999 + + +class Qwen3ActionConverter(BaseActionConverter[Qwen3Action]): + """Convert Qwen3 CUA actions to pyautogui command strings. + + This converter handles: + 1. Coordinate scaling from 0-999 space to sandbox dimensions (1920x1080) + 2. Action format conversion from Qwen3 format to pyautogui strings + 3. Key name normalization for hotkey combinations + 4. Cursor position tracking for scroll and drag actions + + The output can be converted to runtime API steps via action_string_to_step(). + """ + + def __init__( + self, + *, + config: ConverterConfig | None = None, + logger: Any | None = None, + ): + """Initialize the Qwen3 converter. + + Qwen3 starts with cursor at screen center by default. 
+ """ + super().__init__(config=config, logger=logger) + # Qwen3 starts cursor at center + self._last_x = self.config.sandbox_width // 2 + self._last_y = self.config.sandbox_height // 2 + + @property + def coord_width(self) -> int: + return QWEN3_COORD_SIZE + + @property + def coord_height(self) -> int: + return QWEN3_COORD_SIZE + + def _get_coords_from_action(self, action: Qwen3Action) -> tuple[int, int]: + """Extract and scale coordinates from action, falling back to last position.""" + if action.coordinate is not None and len(action.coordinate) >= 2: + x, y = action.coordinate[:2] + scaled_x, scaled_y = self.scale_coordinate(int(x), int(y)) + self._last_x, self._last_y = scaled_x, scaled_y + return scaled_x, scaled_y + else: + return self._last_x, self._last_y + + def _convert_single_action(self, action: Qwen3Action) -> list[str]: + """Convert a single Qwen3 action to pyautogui command string(s).""" + action_type = action.action_type.lower() + + if action_type == "mouse_move": + x, y = self._get_coords_from_action(action) + return [f"pyautogui.moveTo({x}, {y})"] + + if action_type == "left_click": + x, y = self._get_coords_from_action(action) + return [f"pyautogui.click(x={x}, y={y})"] + + if action_type == "double_click": + x, y = self._get_coords_from_action(action) + return [f"pyautogui.doubleClick(x={x}, y={y})"] + + if action_type == "triple_click": + x, y = self._get_coords_from_action(action) + return [f"pyautogui.tripleClick(x={x}, y={y})"] + + if action_type == "right_click": + x, y = self._get_coords_from_action(action) + return [f"pyautogui.rightClick(x={x}, y={y})"] + + if action_type == "middle_click": + x, y = self._get_coords_from_action(action) + return [f"pyautogui.click(x={x}, y={y}, button='middle')"] + + if action_type == "left_click_drag": + sx, sy = self._last_x, self._last_y + + if action.coordinate is None or len(action.coordinate) < 2: + raise ValueError( + "coordinate (end position) is required for left_click_drag" + ) + + ex, ey = 
self.scale_coordinate( + int(action.coordinate[0]), int(action.coordinate[1]) + ) + self._last_x, self._last_y = ex, ey + + return [ + f"pyautogui.moveTo({sx}, {sy})", + f"pyautogui.dragTo({ex}, {ey}, duration={self.config.drag_duration})", + ] + + if action_type == "type": + if action.text is None: + raise ValueError("text is required for type action") + text = action.text.replace("\\", "\\\\").replace("'", "\\'") + return [f"pyautogui.typewrite('{text}')"] + + if action_type == "key": + if action.keys is None or len(action.keys) == 0: + raise ValueError("keys array is required for key action") + + keys = [self.normalize_key(k) for k in action.keys] + keys = [k for k in keys if k] + + if not keys: + raise ValueError(f"Invalid key combination: {action.keys}") + + keys_str = ", ".join(repr(k) for k in keys) + return [ + f"pyautogui.hotkey({keys_str}, interval={self.config.hotkey_interval})" + ] + + if action_type in ("scroll", "hscroll"): + x, y = self._get_coords_from_action(action) + + pixels = action.pixels if action.pixels is not None else 0 + if pixels >= 0: + scroll_val = self.config.scroll_amount + else: + scroll_val = -self.config.scroll_amount + + return [ + f"pyautogui.moveTo({x}, {y})", + f"pyautogui.scroll({scroll_val})", + ] + + if action_type == "wait": + duration = ( + action.time if action.time is not None else self.config.wait_duration + ) + return [f"WAIT({duration})"] + + if action_type == "terminate": + status = action.status or "success" + self._log_info(f"Task terminated with status: {status}") + return ["DONE"] + + if action_type == "answer": + answer_text = action.text or "" + self._log_info(f"Model answer: {answer_text}") + return [] # No-op + + self._log_debug(f"Unknown Qwen3 action type: {action_type}") + return [] + + def serialize_actions(self, actions: list[Qwen3Action]) -> list[dict[str, Any]]: + """Serialize Qwen3 actions for trajectory logging.""" + serialized = [] + for action in actions or []: + serialized.append( + { + "type": 
action.action_type, + "coordinate": list(action.coordinate) + if action.coordinate + else None, + "text": action.text, + "keys": action.keys, + "pixels": action.pixels, + "time": action.time, + "status": action.status, + } + ) + return serialized + + def update_cursor(self, x: int, y: int) -> None: + """Update the cursor position after action execution.""" + self._last_x = x + self._last_y = y + + def get_cursor(self) -> tuple[int, int]: + """Get current cursor position in sandbox coordinates.""" + return self._last_x, self._last_y diff --git a/src/oagi/handler/pyautogui_action_handler.py b/src/oagi/handler/pyautogui_action_handler.py index 4a2853e..c0e1baa 100644 --- a/src/oagi/handler/pyautogui_action_handler.py +++ b/src/oagi/handler/pyautogui_action_handler.py @@ -17,6 +17,7 @@ from ..exceptions import check_optional_dependency from ..types import Action, ActionType, parse_coords, parse_drag_coords, parse_scroll from .capslock_manager import CapsLockManager +from .utils import CoordinateScaler, normalize_key, parse_hotkey check_optional_dependency("pyautogui", "PyautoguiActionHandler", "desktop") import pyautogui # noqa: E402 @@ -92,6 +93,15 @@ def __init__(self, config: PyautoguiConfig | None = None): self.caps_manager = CapsLockManager(mode=self.config.capslock_mode) # The origin position of coordinates (the top-left corner of the target screen) self.origin_x, self.origin_y = 0, 0 + # Initialize coordinate scaler (OAGI uses 0-1000 normalized coordinates) + self._coord_scaler = CoordinateScaler( + source_width=1000, + source_height=1000, + target_width=self.screen_width, + target_height=self.screen_height, + origin_x=self.origin_x, + origin_y=self.origin_y, + ) def reset(self): """Reset handler state. 
@@ -109,6 +119,9 @@ def set_target_screen(self, screen: Screen) -> None: """ self.screen_width, self.screen_height = screen.width, screen.height self.origin_x, self.origin_y = screen.x, screen.y + # Update coordinate scaler + self._coord_scaler.set_target_size(screen.width, screen.height) + self._coord_scaler.set_origin(screen.x, screen.y) def _denormalize_coords(self, x: float, y: float) -> tuple[int, int]: """Convert coordinates from 0-1000 range to actual screen coordinates. @@ -116,26 +129,7 @@ def _denormalize_coords(self, x: float, y: float) -> tuple[int, int]: Also handles corner coordinates to prevent PyAutoGUI fail-safe trigger. Corner coordinates (0,0), (0,max), (max,0), (max,max) are offset by 1 pixel. """ - screen_x = int(x * self.screen_width / 1000) - screen_y = int(y * self.screen_height / 1000) - - # Prevent fail-safe by adjusting corner coordinates - # Check if coordinates are at screen corners (with small tolerance) - if screen_x < 1: - screen_x = 1 - elif screen_x > self.screen_width - 1: - screen_x = self.screen_width - 1 - - if screen_y < 1: - screen_y = 1 - elif screen_y > self.screen_height - 1: - screen_y = self.screen_height - 1 - - # Add origin offset to convert relative to top-left corner - screen_x += self.origin_x - screen_y += self.origin_y - - return screen_x, screen_y + return self._coord_scaler.scale(x, y, prevent_failsafe=True) def _parse_coords(self, args_str: str) -> tuple[int, int]: """Extract x, y coordinates from argument string.""" @@ -163,28 +157,15 @@ def _parse_scroll(self, args_str: str) -> tuple[int, int, str]: def _normalize_key(self, key: str) -> str: """Normalize key names for consistency.""" - key = key.strip().lower() - # Normalize caps lock variations - hotkey_variations_mapping = { - "capslock": ["caps_lock", "caps", "capslock"], - "pgup": ["page_up", "pageup"], - "pgdn": ["page_down", "pagedown"], - } - for normalized, variations in hotkey_variations_mapping.items(): - if key in variations: - return normalized - 
# Remap ctrl to command on macOS if enabled - if self.config.macos_ctrl_to_cmd and sys.platform == "darwin" and key == "ctrl": - return "command" - return key + return normalize_key(key, macos_ctrl_to_cmd=self.config.macos_ctrl_to_cmd) def _parse_hotkey(self, args_str: str) -> list[str]: """Parse hotkey string into list of keys.""" - # Remove parentheses if present - args_str = args_str.strip("()") - # Split by '+' to get individual keys - keys = [self._normalize_key(key) for key in args_str.split("+")] - return keys + return parse_hotkey( + args_str, + macos_ctrl_to_cmd=self.config.macos_ctrl_to_cmd, + validate=False, # Don't validate, let pyautogui handle invalid keys + ) def _move_and_wait(self, x: int, y: int) -> None: """Move cursor to position and wait before clicking.""" diff --git a/src/oagi/handler/utils.py b/src/oagi/handler/utils.py index 8db1604..ff36b02 100644 --- a/src/oagi/handler/utils.py +++ b/src/oagi/handler/utils.py @@ -5,6 +5,594 @@ # This file is part of the official API project. # Licensed under the MIT License. # ----------------------------------------------------------------------------- +"""Shared utilities for action handling and conversion. + +This module provides common functionality used by both PyautoguiActionHandler +(for local execution) and action converters (for remote execution). 
+""" + +import sys + +# ============================================================================= +# Key Normalization Mapping +# ============================================================================= + +# Unified key mapping - normalizes various key name formats to pyautogui names +KEY_MAP: dict[str, str] = { + # Modifier keys + "ctrl": "ctrl", + "control": "ctrl", + "alt": "alt", + "option": "alt", + "shift": "shift", + "cmd": "command", + "command": "command", + "meta": "win", + "super": "win", + "windows": "win", + "win": "win", + # Enter/Return + "return": "enter", + "enter": "enter", + # Escape + "escape": "escape", + "esc": "escape", + # Page navigation + "pageup": "pageup", + "page_up": "pageup", + "pgup": "pageup", + "pagedown": "pagedown", + "page_down": "pagedown", + "pgdn": "pagedown", + # Lock keys + "capslock": "capslock", + "caps_lock": "capslock", + "caps": "capslock", + "numlock": "numlock", + "num_lock": "numlock", + "scrolllock": "scrolllock", + "scroll_lock": "scrolllock", + # Print screen + "printscreen": "printscreen", + "print_screen": "printscreen", + "prtsc": "printscreen", + "prtscr": "printscreen", + # Media keys + "mute": "volumemute", + "play": "playpause", +} + +# Valid pyautogui key names +PYAUTOGUI_VALID_KEYS: frozenset[str] = frozenset( + { + # Alphabet keys + "a", + "b", + "c", + "d", + "e", + "f", + "g", + "h", + "i", + "j", + "k", + "l", + "m", + "n", + "o", + "p", + "q", + "r", + "s", + "t", + "u", + "v", + "w", + "x", + "y", + "z", + # Number keys + "0", + "1", + "2", + "3", + "4", + "5", + "6", + "7", + "8", + "9", + # Function keys + "f1", + "f2", + "f3", + "f4", + "f5", + "f6", + "f7", + "f8", + "f9", + "f10", + "f11", + "f12", + "f13", + "f14", + "f15", + "f16", + "f17", + "f18", + "f19", + "f20", + "f21", + "f22", + "f23", + "f24", + # Navigation keys + "up", + "down", + "left", + "right", + "home", + "end", + "pageup", + "pagedown", + "pgup", + "pgdn", + # Editing keys + "backspace", + "delete", + "del", + 
"insert", + "enter", + "return", + "tab", + "space", + # Modifier keys (with left/right variants) + "shift", + "shiftleft", + "shiftright", + "ctrl", + "ctrlleft", + "ctrlright", + "alt", + "altleft", + "altright", + "option", + "optionleft", + "optionright", + "command", + "win", + "winleft", + "winright", + "fn", + # Lock keys + "capslock", + "numlock", + "scrolllock", + # Special keys + "esc", + "escape", + "pause", + "printscreen", + "prtsc", + "prtscr", + "prntscrn", + "print", + "apps", + "clear", + "sleep", + # Symbols + "!", + "@", + "#", + "$", + "%", + "^", + "&", + "*", + "(", + ")", + "-", + "_", + "=", + "+", + "[", + "]", + "{", + "}", + "\\", + "|", + ";", + ":", + "'", + '"', + ",", + ".", + "<", + ">", + "/", + "?", + "`", + "~", + # Numpad keys + "num0", + "num1", + "num2", + "num3", + "num4", + "num5", + "num6", + "num7", + "num8", + "num9", + "divide", + "multiply", + "subtract", + "add", + "decimal", + # Media keys + "volumeup", + "volumedown", + "volumemute", + "playpause", + "stop", + "nexttrack", + "prevtrack", + # Browser keys + "browserback", + "browserforward", + "browserrefresh", + "browserstop", + "browsersearch", + "browserfavorites", + "browserhome", + # Application launch keys + "launchapp1", + "launchapp2", + "launchmail", + "launchmediaselect", + } +) + + +# ============================================================================= +# Coordinate Scaling +# ============================================================================= + + +class CoordinateScaler: + """Handles coordinate scaling between different coordinate systems. + + This class provides reusable coordinate transformation logic used by both + PyautoguiActionHandler (local execution) and action converters (remote execution). 
+ + Args: + source_width: Width of the source coordinate space (e.g., 1000 for OAGI) + source_height: Height of the source coordinate space + target_width: Width of the target coordinate space (e.g., screen width) + target_height: Height of the target coordinate space + origin_x: X offset of the target coordinate origin (for multi-monitor) + origin_y: Y offset of the target coordinate origin (for multi-monitor) + """ + + def __init__( + self, + source_width: int, + source_height: int, + target_width: int, + target_height: int, + origin_x: int = 0, + origin_y: int = 0, + ): + self.source_width = source_width + self.source_height = source_height + self.target_width = target_width + self.target_height = target_height + self.origin_x = origin_x + self.origin_y = origin_y + self.scale_x = target_width / source_width + self.scale_y = target_height / source_height + + def scale( + self, + x: int | float, + y: int | float, + *, + clamp: bool = True, + prevent_failsafe: bool = False, + ) -> tuple[int, int]: + """Scale coordinates from source to target space. 
+ + Args: + x: X coordinate in source space + y: Y coordinate in source space + clamp: If True, clamp to valid target range + prevent_failsafe: If True, offset corner coordinates by 1 pixel + (prevents PyAutoGUI fail-safe trigger) + + Returns: + Tuple of (target_x, target_y) in target coordinate space + """ + scaled_x = round(x * self.scale_x) + scaled_y = round(y * self.scale_y) + + if clamp: + # Clamp to valid range + scaled_x = max(0, min(scaled_x, self.target_width - 1)) + scaled_y = max(0, min(scaled_y, self.target_height - 1)) + + if prevent_failsafe: + # Prevent PyAutoGUI fail-safe by adjusting corner coordinates + if scaled_x < 1: + scaled_x = 1 + elif scaled_x > self.target_width - 2: + scaled_x = self.target_width - 2 + if scaled_y < 1: + scaled_y = 1 + elif scaled_y > self.target_height - 2: + scaled_y = self.target_height - 2 + + # Add origin offset (for multi-monitor support) + return scaled_x + self.origin_x, scaled_y + self.origin_y + + def set_origin(self, origin_x: int, origin_y: int) -> None: + """Update the origin offset.""" + self.origin_x = origin_x + self.origin_y = origin_y + + def set_target_size(self, width: int, height: int) -> None: + """Update the target size and recalculate scale factors.""" + self.target_width = width + self.target_height = height + self.scale_x = width / self.source_width + self.scale_y = height / self.source_height + + +# ============================================================================= +# Key Normalization Functions +# ============================================================================= + + +def normalize_key(key: str, *, macos_ctrl_to_cmd: bool = False) -> str: + """Normalize a key name to pyautogui format. 
+ + Args: + key: Key name to normalize (e.g., "ctrl", "Control", "page_down") + macos_ctrl_to_cmd: If True and on macOS, remap 'ctrl' to 'command' + + Returns: + Normalized key name (e.g., "ctrl", "pagedown") + """ + key = key.strip().lower() + normalized = KEY_MAP.get(key, key) + + # Remap ctrl to command on macOS if enabled + if macos_ctrl_to_cmd and sys.platform == "darwin" and normalized == "ctrl": + return "command" + + return normalized + + +def parse_hotkey( + hotkey_str: str, + *, + macos_ctrl_to_cmd: bool = False, + validate: bool = True, +) -> list[str]: + """Parse a hotkey string into a list of normalized key names. + + Args: + hotkey_str: Hotkey string (e.g., "ctrl+c", "alt, tab", "Shift+Enter") + macos_ctrl_to_cmd: If True and on macOS, remap 'ctrl' to 'command' + validate: If True, validate keys against PYAUTOGUI_VALID_KEYS + + Returns: + List of normalized key names (e.g., ["ctrl", "c"]) + + Raises: + ValueError: If validate=True and any key is invalid + """ + # Remove parentheses if present + hotkey_str = hotkey_str.strip("()") + + # Split by '+' or ',' to get individual keys + if "+" in hotkey_str: + keys = [ + normalize_key(k, macos_ctrl_to_cmd=macos_ctrl_to_cmd) + for k in hotkey_str.split("+") + ] + else: + keys = [ + normalize_key(k, macos_ctrl_to_cmd=macos_ctrl_to_cmd) + for k in hotkey_str.split(",") + ] + + # Filter empty strings + keys = [k for k in keys if k] + + if validate: + validate_keys(keys) + + return keys + + +def validate_keys(keys: list[str]) -> None: + """Validate that all keys are recognized by pyautogui. 
+ + Args: + keys: List of normalized key names + + Raises: + ValueError: If any key is invalid, with helpful suggestions + """ + invalid_keys = [k for k in keys if k and k not in PYAUTOGUI_VALID_KEYS] + + if invalid_keys: + suggestions = [] + for invalid_key in invalid_keys: + if invalid_key in ("ret",): + suggestions.append(f"'{invalid_key}' -> use 'enter' or 'return'") + elif invalid_key.startswith("num") and len(invalid_key) > 3: + suggestions.append( + f"'{invalid_key}' -> numpad keys use format 'num0'-'num9'" + ) + else: + suggestions.append(f"'{invalid_key}' is not a valid key name") + + error_msg = "Invalid key name(s) in hotkey: " + ", ".join(suggestions) + valid_sample = ", ".join(sorted(list(PYAUTOGUI_VALID_KEYS)[:30])) + error_msg += f"\n\nValid keys include: {valid_sample}... (and more)" + raise ValueError(error_msg) + + +# ============================================================================= +# Coordinate Parsing Functions +# ============================================================================= + + +def parse_click_coords( + argument: str, + scaler: CoordinateScaler, + *, + prevent_failsafe: bool = False, +) -> tuple[int, int]: + """Parse click coordinates from argument string. + + Args: + argument: Coordinate string in format "x, y" + scaler: CoordinateScaler instance for coordinate transformation + prevent_failsafe: If True, offset corner coordinates + + Returns: + Tuple of (x, y) in target coordinate space + + Raises: + ValueError: If coordinate format is invalid + """ + # Check for common format errors + if " and " in argument.lower() or " then " in argument.lower(): + raise ValueError( + f"Invalid click format: '{argument}'. " + "Cannot combine multiple actions with 'and' or 'then'." + ) + + parts = argument.split(",") if argument else [] + if len(parts) < 2: + raise ValueError( + f"Invalid click coordinate format: '{argument}'. 
" + "Expected 'x, y' (comma-separated numeric values)" + ) + + try: + x = float(parts[0].strip()) + y = float(parts[1].strip()) + return scaler.scale(x, y, prevent_failsafe=prevent_failsafe) + except (ValueError, IndexError) as e: + raise ValueError( + f"Failed to parse click coords '{argument}': {e}. " + "Coordinates must be comma-separated numeric values." + ) from e + + +def parse_drag_coords( + argument: str, + scaler: CoordinateScaler, + *, + prevent_failsafe: bool = False, +) -> tuple[int, int, int, int]: + """Parse drag coordinates from argument string. + + Args: + argument: Coordinate string in format "x1, y1, x2, y2" + scaler: CoordinateScaler instance for coordinate transformation + prevent_failsafe: If True, offset corner coordinates + + Returns: + Tuple of (x1, y1, x2, y2) in target coordinate space + + Raises: + ValueError: If coordinate format is invalid + """ + # Check for common format errors + if " and " in argument.lower() or " then " in argument.lower(): + raise ValueError( + f"Invalid drag format: '{argument}'. " + "Cannot combine multiple actions with 'and' or 'then'." + ) + + parts = argument.split(",") if argument else [] + if len(parts) != 4: + raise ValueError( + f"Invalid drag coordinate format: '{argument}'. " + "Expected 'x1, y1, x2, y2' (4 comma-separated numeric values)" + ) + + try: + sx = float(parts[0].strip()) + sy = float(parts[1].strip()) + ex = float(parts[2].strip()) + ey = float(parts[3].strip()) + x1, y1 = scaler.scale(sx, sy, prevent_failsafe=prevent_failsafe) + x2, y2 = scaler.scale(ex, ey, prevent_failsafe=prevent_failsafe) + return x1, y1, x2, y2 + except (ValueError, IndexError) as e: + raise ValueError( + f"Failed to parse drag coords '{argument}': {e}. " + "Coordinates must be comma-separated numeric values." 
+ ) from e + + +def parse_scroll_coords( + argument: str, + scaler: CoordinateScaler, + *, + prevent_failsafe: bool = False, +) -> tuple[int, int, str]: + """Parse scroll coordinates and direction from argument string. + + Args: + argument: Scroll string in format "x, y, direction" + scaler: CoordinateScaler instance for coordinate transformation + prevent_failsafe: If True, offset corner coordinates + + Returns: + Tuple of (x, y, direction) where direction is 'up' or 'down' + + Raises: + ValueError: If format is invalid + """ + parts = [p.strip() for p in argument.split(",")] + if len(parts) != 3: + raise ValueError( + f"Invalid scroll format: '{argument}'. " + "Expected 'x, y, direction' (e.g., '500, 300, up')" + ) + + try: + x = float(parts[0]) + y = float(parts[1]) + direction = parts[2].lower() + + if direction not in ("up", "down"): + raise ValueError( + f"Invalid scroll direction: '{direction}'. Use 'up' or 'down'." + ) + + scaled_x, scaled_y = scaler.scale(x, y, prevent_failsafe=prevent_failsafe) + return scaled_x, scaled_y, direction + except (ValueError, IndexError) as e: + if "scroll direction" in str(e): + raise + raise ValueError( + f"Failed to parse scroll coords '{argument}': {e}. 
" + "Format: 'x, y, direction'" + ) from e + + +# ============================================================================= +# Handler Utility Functions +# ============================================================================= def reset_handler(handler) -> None: From 14937af30304edad2235c7e2ee8425b8c1d3725b Mon Sep 17 00:00:00 2001 From: aoshen524 Date: Mon, 2 Feb 2026 14:21:11 +0000 Subject: [PATCH 02/11] refactor(converters): simplify to base + oagi only - Remove model-specific converters (claude, gemini, qwen3, models) - Keep BaseActionConverter for third-party inheritance - Keep OagiActionConverter as reference implementation - Add comprehensive test suite for OagiActionConverter - Update exports in __init__.py files Third parties can now create custom converters by inheriting from BaseActionConverter and implementing the required abstract methods. Co-Authored-By: Claude Opus 4.5 --- src/oagi/__init__.py | 16 -- src/oagi/converters/__init__.py | 57 ++++---- src/oagi/converters/claude.py | 193 ------------------------- src/oagi/converters/gemini.py | 217 ---------------------------- src/oagi/converters/models.py | 103 ------------- src/oagi/converters/qwen3.py | 194 ------------------------- tests/test_oagi_action_converter.py | 135 +++++++++++++++++ 7 files changed, 163 insertions(+), 752 deletions(-) delete mode 100644 src/oagi/converters/claude.py delete mode 100644 src/oagi/converters/gemini.py delete mode 100644 src/oagi/converters/models.py delete mode 100644 src/oagi/converters/qwen3.py create mode 100644 tests/test_oagi_action_converter.py diff --git a/src/oagi/__init__.py b/src/oagi/__init__.py index 07bf68c..13eea94 100644 --- a/src/oagi/__init__.py +++ b/src/oagi/__init__.py @@ -40,12 +40,6 @@ _LAZY_IMPORTS_DATA: dict[str, tuple[str, str | None, str | None]] = { # Action converters (no optional dependencies) "OagiActionConverter": ("oagi.converters.oagi", None, None), - "ClaudeActionConverter": ("oagi.converters.claude", None, None), - 
"Qwen3ActionConverter": ("oagi.converters.qwen3", None, None), - "GeminiActionConverter": ("oagi.converters.gemini", None, None), - "ClaudeAction": ("oagi.converters.models", None, None), - "Qwen3Action": ("oagi.converters.models", None, None), - "GeminiAction": ("oagi.converters.models", None, None), "ConverterConfig": ("oagi.converters.base", None, None), "BaseActionConverter": ("oagi.converters.base", None, None), # Desktop handlers (require pyautogui/PIL) @@ -99,11 +93,7 @@ from oagi.agent.observer.agent_observer import AsyncAgentObserver from oagi.agent.tasker import TaskerAgent from oagi.converters.base import BaseActionConverter, ConverterConfig - from oagi.converters.claude import ClaudeActionConverter - from oagi.converters.gemini import GeminiActionConverter - from oagi.converters.models import ClaudeAction, GeminiAction, Qwen3Action from oagi.converters.oagi import OagiActionConverter - from oagi.converters.qwen3 import Qwen3ActionConverter from oagi.handler.async_pyautogui_action_handler import AsyncPyautoguiActionHandler from oagi.handler.async_screenshot_maker import AsyncScreenshotMaker from oagi.handler.async_ydotool_action_handler import AsyncYdotoolActionHandler @@ -192,12 +182,6 @@ def __dir__() -> list[str]: "ScreenManager", # Lazy imports - Action converters "OagiActionConverter", - "ClaudeActionConverter", - "Qwen3ActionConverter", - "GeminiActionConverter", - "ClaudeAction", - "Qwen3Action", - "GeminiAction", "ConverterConfig", "BaseActionConverter", ] diff --git a/src/oagi/converters/__init__.py b/src/oagi/converters/__init__.py index 3999591..062c94a 100644 --- a/src/oagi/converters/__init__.py +++ b/src/oagi/converters/__init__.py @@ -5,53 +5,52 @@ # This file is part of the official API project. # Licensed under the MIT License. # ----------------------------------------------------------------------------- -"""Action converters for multi-model VLM support. +"""Action converters for VLM support. 
-This module provides converters for different VLM models: -- OAGI: Native OAGI actions (0-1000 normalized coordinates) -- Claude: Claude CUA actions (XGA 1024x768 coordinates) -- Qwen3: Qwen3-VL actions (0-999 normalized coordinates) -- Gemini: Gemini CUA actions (0-1000 normalized coordinates) - -All converters output pyautogui command strings that can be: -1. Executed locally via PyautoguiActionHandler -2. Sent to remote sandbox via runtime API (using action_string_to_step()) +This module provides the base class and OAGI implementation for action converters. +Third parties can inherit from BaseActionConverter to create custom converters. Example usage: - from oagi.converters import ClaudeActionConverter, ClaudeAction, ConverterConfig + from oagi.converters import OagiActionConverter, ConverterConfig # Configure for 1920x1080 sandbox config = ConverterConfig(sandbox_width=1920, sandbox_height=1080) - converter = ClaudeActionConverter(config=config) + converter = OagiActionConverter(config=config) - # Convert Claude actions to pyautogui strings - actions = [ClaudeAction(action_type="left_click", coordinate=(512, 384))] - pyautogui_commands = converter(actions) + # Convert OAGI actions to pyautogui strings + result = converter(actions) # list[tuple[str, bool]] # Convert to runtime API steps - for cmd, is_last in pyautogui_commands: + for cmd, is_last in result: step = converter.action_string_to_step(cmd) # Execute step via runtime API... + +Creating custom converters: + from oagi.converters import BaseActionConverter, ConverterConfig + + class MyActionConverter(BaseActionConverter[MyAction]): + @property + def coord_width(self) -> int: + return 1000 # Your model's coordinate width + + @property + def coord_height(self) -> int: + return 1000 # Your model's coordinate height + + def _convert_single_action(self, action: MyAction) -> list[str]: + # Convert action to pyautogui command strings + ... 
+ + def serialize_actions(self, actions: list[MyAction]) -> list[dict]: + # Serialize actions for trajectory logging + ... """ from oagi.converters.base import BaseActionConverter, ConverterConfig -from oagi.converters.claude import ClaudeActionConverter -from oagi.converters.gemini import GeminiActionConverter -from oagi.converters.models import ClaudeAction, GeminiAction, Qwen3Action from oagi.converters.oagi import OagiActionConverter -from oagi.converters.qwen3 import Qwen3ActionConverter __all__ = [ - # Base "BaseActionConverter", "ConverterConfig", - # Converters "OagiActionConverter", - "ClaudeActionConverter", - "Qwen3ActionConverter", - "GeminiActionConverter", - # Action models - "ClaudeAction", - "Qwen3Action", - "GeminiAction", ] diff --git a/src/oagi/converters/claude.py b/src/oagi/converters/claude.py deleted file mode 100644 index bad3259..0000000 --- a/src/oagi/converters/claude.py +++ /dev/null @@ -1,193 +0,0 @@ -# ----------------------------------------------------------------------------- -# Copyright (c) OpenAGI Foundation -# All rights reserved. -# -# This file is part of the official API project. -# Licensed under the MIT License. -# ----------------------------------------------------------------------------- -"""Claude action converter. - -This module provides the ClaudeActionConverter for converting Claude CUA -(Computer Use Agent) actions to pyautogui command strings. -""" - -from typing import Any - -from .base import BaseActionConverter -from .models import ClaudeAction - -# Claude uses XGA resolution (1024x768) for coordinate space -XGA_WIDTH = 1024 -XGA_HEIGHT = 768 - - -class ClaudeActionConverter(BaseActionConverter[ClaudeAction]): - """Convert Claude CUA actions to pyautogui command strings. - - This converter handles: - 1. Coordinate scaling from XGA (1024x768) to sandbox dimensions (1920x1080) - 2. Action format conversion from Claude format to pyautogui strings - 3. 
Key name normalization for hotkey combinations - - The output can be converted to runtime API steps via action_string_to_step(). - """ - - @property - def coord_width(self) -> int: - return XGA_WIDTH - - @property - def coord_height(self) -> int: - return XGA_HEIGHT - - def _parse_claude_hotkey(self, text: str) -> list[str]: - """Parse Claude hotkey string into list of normalized keys. - - Claude uses "-" or "+" as separators. - """ - text = text.replace("-", "+") - keys = [self.normalize_key(k) for k in text.split("+") if k.strip()] - return keys - - def _get_coords_or_last(self, action: ClaudeAction) -> tuple[int, int]: - """Get scaled coordinates from action or fall back to last position.""" - if action.coordinate is not None: - x, y = self.scale_coordinate(*action.coordinate) - self._last_x, self._last_y = x, y - return x, y - elif self._last_x is not None and self._last_y is not None: - return self._last_x, self._last_y - else: - return self._get_last_or_center() - - def _convert_single_action(self, action: ClaudeAction) -> list[str]: - """Convert a single Claude action to pyautogui command string(s).""" - action_type = action.action_type.lower() - - if action_type == "screenshot": - return [] # No-op - - if action_type == "mouse_move": - if action.coordinate is None: - raise ValueError("coordinate is required for mouse_move") - x, y = self.scale_coordinate(*action.coordinate) - self._last_x, self._last_y = x, y - return [f"pyautogui.moveTo({x}, {y})"] - - if action_type == "left_click": - x, y = self._get_coords_or_last(action) - return [f"pyautogui.click(x={x}, y={y})"] - - if action_type == "double_click": - x, y = self._get_coords_or_last(action) - return [f"pyautogui.doubleClick(x={x}, y={y})"] - - if action_type == "triple_click": - x, y = self._get_coords_or_last(action) - return [f"pyautogui.tripleClick(x={x}, y={y})"] - - if action_type == "right_click": - x, y = self._get_coords_or_last(action) - return [f"pyautogui.rightClick(x={x}, y={y})"] - - if 
action_type == "middle_click": - x, y = self._get_coords_or_last(action) - return [f"pyautogui.click(x={x}, y={y}, button='middle')"] - - if action_type == "left_click_drag": - # Start from start_coordinate or last position - if action.start_coordinate is not None: - sx, sy = self.scale_coordinate(*action.start_coordinate) - elif self._last_x is not None and self._last_y is not None: - sx, sy = self._last_x, self._last_y - else: - sx, sy = self._get_last_or_center() - - # End at coordinate - if action.coordinate is None: - raise ValueError( - "coordinate (end position) is required for left_click_drag" - ) - ex, ey = self.scale_coordinate(*action.coordinate) - self._last_x, self._last_y = ex, ey - - return [ - f"pyautogui.moveTo({sx}, {sy})", - f"pyautogui.dragTo({ex}, {ey}, duration={self.config.drag_duration})", - ] - - if action_type == "type": - if action.text is None: - raise ValueError("text is required for type action") - text = action.text.replace("\\", "\\\\").replace("'", "\\'") - return [f"pyautogui.typewrite('{text}')"] - - if action_type == "key": - if action.text is None: - raise ValueError("text is required for key action") - keys = self._parse_claude_hotkey(action.text) - if not keys: - raise ValueError(f"Invalid key combination: {action.text}") - keys_str = ", ".join(repr(k) for k in keys) - return [ - f"pyautogui.hotkey({keys_str}, interval={self.config.hotkey_interval})" - ] - - if action_type == "scroll": - if action.coordinate is None: - raise ValueError("coordinate is required for scroll action") - x, y = self.scale_coordinate(*action.coordinate) - - direction = (action.scroll_direction or "down").strip().lower() - amount = ( - action.scroll_amount - if action.scroll_amount is not None - else self.config.scroll_amount - ) - - if direction == "up": - scroll_val = amount - elif direction == "down": - scroll_val = -amount - else: - raise ValueError(f"Invalid scroll direction: {direction}") - - return [ - f"pyautogui.moveTo({x}, {y})", - 
f"pyautogui.scroll({scroll_val})", - ] - - if action_type == "wait": - duration = ( - action.duration - if action.duration is not None - else self.config.wait_duration - ) - return [f"WAIT({duration})"] - - if action_type == "cursor_position": - return [] # No-op - - self._log_debug(f"Unknown Claude action type: {action_type}") - return [] - - def serialize_actions(self, actions: list[ClaudeAction]) -> list[dict[str, Any]]: - """Serialize Claude actions for trajectory logging.""" - serialized = [] - for action in actions or []: - serialized.append( - { - "type": action.action_type, - "coordinate": list(action.coordinate) - if action.coordinate - else None, - "start_coordinate": list(action.start_coordinate) - if action.start_coordinate - else None, - "text": action.text, - "scroll_direction": action.scroll_direction, - "scroll_amount": action.scroll_amount, - "duration": action.duration, - } - ) - return serialized diff --git a/src/oagi/converters/gemini.py b/src/oagi/converters/gemini.py deleted file mode 100644 index 0386b04..0000000 --- a/src/oagi/converters/gemini.py +++ /dev/null @@ -1,217 +0,0 @@ -# ----------------------------------------------------------------------------- -# Copyright (c) OpenAGI Foundation -# All rights reserved. -# -# This file is part of the official API project. -# Licensed under the MIT License. -# ----------------------------------------------------------------------------- -"""Gemini action converter. - -This module provides the GeminiActionConverter for converting Gemini CUA -actions to pyautogui command strings. -""" - -from typing import Any - -from .base import BaseActionConverter -from .models import GeminiAction - -# Gemini uses 0-1000 coordinate space -GEMINI_COORD_SIZE = 1000 - - -class GeminiActionConverter(BaseActionConverter[GeminiAction]): - """Convert Gemini CUA actions to pyautogui command strings. - - This converter handles: - 1. Coordinate scaling from 0-1000 space to sandbox dimensions (1920x1080) - 2. 
Action format conversion from Gemini format to pyautogui strings - 3. High-level actions like navigate, search, go_back - 4. Key name normalization for hotkey combinations - - The output can be converted to runtime API steps via action_string_to_step(). - """ - - @property - def coord_width(self) -> int: - return GEMINI_COORD_SIZE - - @property - def coord_height(self) -> int: - return GEMINI_COORD_SIZE - - def _parse_gemini_hotkey(self, keys_str: str) -> list[str]: - """Parse Gemini hotkey string into list of normalized keys.""" - keys_str = keys_str.replace("-", "+") - keys = [self.normalize_key(k) for k in keys_str.split("+") if k.strip()] - return keys - - def _convert_single_action(self, action: GeminiAction) -> list[str]: - """Convert a single Gemini action to pyautogui command string(s).""" - action_type = action.action_type.lower() - hotkey_interval = self.config.hotkey_interval - - if action_type == "open_web_browser": - return [] # No-op - - if action_type == "click_at": - if action.x is None or action.y is None: - raise ValueError("x and y are required for click_at") - x, y = self.scale_coordinate(action.x, action.y) - self._last_x, self._last_y = x, y - return [f"pyautogui.click(x={x}, y={y})"] - - if action_type == "hover_at": - if action.x is None or action.y is None: - raise ValueError("x and y are required for hover_at") - x, y = self.scale_coordinate(action.x, action.y) - self._last_x, self._last_y = x, y - return [f"pyautogui.moveTo({x}, {y})"] - - if action_type == "type_text_at": - if action.x is None or action.y is None: - raise ValueError("x and y are required for type_text_at") - if action.text is None: - raise ValueError("text is required for type_text_at") - - x, y = self.scale_coordinate(action.x, action.y) - self._last_x, self._last_y = x, y - - commands = [f"pyautogui.click(x={x}, y={y})"] - - if action.clear_before_typing: - commands.append( - f"pyautogui.hotkey('ctrl', 'a', interval={hotkey_interval})" - ) - 
commands.append("pyautogui.press('delete')") - - text = action.text.replace("\\", "\\\\").replace("'", "\\'") - commands.append(f"pyautogui.typewrite('{text}')") - - if action.press_enter: - commands.append("pyautogui.press('enter')") - - return commands - - if action_type == "scroll_document": - direction = (action.direction or "down").strip().lower() - - if direction == "down": - return ["pyautogui.press('pagedown')"] - elif direction == "up": - return ["pyautogui.press('pageup')"] - elif direction == "left": - return ["pyautogui.press('left')"] - elif direction == "right": - return ["pyautogui.press('right')"] - else: - raise ValueError(f"Invalid scroll direction: {direction}") - - if action_type == "scroll_at": - if action.x is None or action.y is None: - raise ValueError("x and y are required for scroll_at") - - x, y = self.scale_coordinate(action.x, action.y) - direction = (action.direction or "down").strip().lower() - - amount = self.config.scroll_amount - if action.magnitude is not None: - amount = max(1, action.magnitude // 100) - - if direction == "up": - scroll_val = amount - elif direction == "down": - scroll_val = -amount - else: - self._log_debug( - f"Unsupported scroll direction '{direction}', defaulting to down" - ) - scroll_val = -amount - - return [ - f"pyautogui.moveTo({x}, {y})", - f"pyautogui.scroll({scroll_val})", - ] - - if action_type == "wait_5_seconds": - return ["WAIT(5)"] - - if action_type == "go_back": - return [f"pyautogui.hotkey('alt', 'left', interval={hotkey_interval})"] - - if action_type == "go_forward": - return [f"pyautogui.hotkey('alt', 'right', interval={hotkey_interval})"] - - if action_type == "search": - return [ - f"pyautogui.hotkey('ctrl', 'l', interval={hotkey_interval})", - "pyautogui.typewrite('https://www.google.com')", - "pyautogui.press('enter')", - ] - - if action_type == "navigate": - if action.url is None: - raise ValueError("url is required for navigate action") - url = action.url - if not 
url.startswith(("http://", "https://")): - url = "https://" + url - url = url.replace("'", "\\'") - return [ - f"pyautogui.hotkey('ctrl', 'l', interval={hotkey_interval})", - f"pyautogui.hotkey('ctrl', 'a', interval={hotkey_interval})", - f"pyautogui.typewrite('{url}')", - "pyautogui.press('enter')", - ] - - if action_type == "key_combination": - if action.keys is None: - raise ValueError("keys is required for key_combination action") - keys = self._parse_gemini_hotkey(action.keys) - if not keys: - raise ValueError(f"Invalid key combination: {action.keys}") - keys_str = ", ".join(repr(k) for k in keys) - return [f"pyautogui.hotkey({keys_str}, interval={hotkey_interval})"] - - if action_type == "drag_and_drop": - if action.x is None or action.y is None: - raise ValueError( - "x and y (start position) are required for drag_and_drop" - ) - if action.destination_x is None or action.destination_y is None: - raise ValueError( - "destination_x and destination_y are required for drag_and_drop" - ) - - sx, sy = self.scale_coordinate(action.x, action.y) - ex, ey = self.scale_coordinate(action.destination_x, action.destination_y) - self._last_x, self._last_y = ex, ey - - return [ - f"pyautogui.moveTo({sx}, {sy})", - f"pyautogui.dragTo({ex}, {ey}, duration={self.config.drag_duration})", - ] - - self._log_debug(f"Unknown Gemini action type: {action_type}") - return [] - - def serialize_actions(self, actions: list[GeminiAction]) -> list[dict[str, Any]]: - """Serialize Gemini actions for trajectory logging.""" - serialized = [] - for action in actions or []: - serialized.append( - { - "type": action.action_type, - "x": action.x, - "y": action.y, - "text": action.text, - "press_enter": action.press_enter, - "clear_before_typing": action.clear_before_typing, - "direction": action.direction, - "magnitude": action.magnitude, - "destination_x": action.destination_x, - "destination_y": action.destination_y, - "keys": action.keys, - "url": action.url, - } - ) - return serialized diff 
--git a/src/oagi/converters/models.py b/src/oagi/converters/models.py deleted file mode 100644 index b06cb73..0000000 --- a/src/oagi/converters/models.py +++ /dev/null @@ -1,103 +0,0 @@ -# ----------------------------------------------------------------------------- -# Copyright (c) OpenAGI Foundation -# All rights reserved. -# -# This file is part of the official API project. -# Licensed under the MIT License. -# ----------------------------------------------------------------------------- -"""Action dataclasses for model-specific action converters. - -This module provides dataclass definitions for actions from different VLM models: -- ClaudeAction: Claude CUA actions (XGA 1024x768 coordinates) -- Qwen3Action: Qwen3-VL actions (0-999 normalized coordinates) -- GeminiAction: Gemini actions (0-1000 normalized coordinates) - -Note: OAGI actions use the existing oagi.types.Action model. -""" - -from dataclasses import dataclass - - -@dataclass -class ClaudeAction: - """Represents a Claude computer use action. - - Claude uses XGA coordinates (1024x768) for coordinate actions. - - Attributes: - action_type: The type of action (e.g., "left_click", "type", "key") - coordinate: XGA coordinates (x, y) where x in [0,1024] and y in [0,768] - text: Text content for type or key actions - scroll_direction: Direction for scroll ("up" or "down") - scroll_amount: Amount to scroll (optional, uses default if not specified) - duration: Duration in milliseconds for wait actions - start_coordinate: Starting coordinate for drag operations - """ - - action_type: str - coordinate: tuple[int, int] | None = None - text: str | None = None - scroll_direction: str | None = None - scroll_amount: int | None = None - duration: int | None = None - start_coordinate: tuple[int, int] | None = None - - -@dataclass -class Qwen3Action: - """Represents a Qwen3 computer use action. - - Qwen3 uses normalized coordinates (0-999) for coordinate actions. 
- - Attributes: - action_type: The type of action (e.g., "left_click", "type", "key") - coordinate: Normalized coordinates (x, y) where both x and y in [0,999] - text: Text content for type and answer actions - keys: List of key names for key/hotkey actions - pixels: Pixel amount for scroll actions - time: Duration in seconds for wait actions - status: Status string for terminate actions ("success" or "failure") - """ - - action_type: str - coordinate: tuple[int, int] | None = None - text: str | None = None - keys: list[str] | None = None - pixels: int | None = None - time: float | None = None - status: str | None = None - - -@dataclass -class GeminiAction: - """Represents a Gemini computer use action. - - Gemini uses normalized coordinates (0-1000) for coordinate actions. - - Attributes: - action_type: The type of action (e.g., "click_at", "type_text_at", "scroll_at") - x: X coordinate (0-1000) - y: Y coordinate (0-1000) - text: Text content for typing actions - press_enter: Whether to press Enter after typing - clear_before_typing: Whether to clear existing text before typing - direction: Scroll direction ("up", "down", "left", "right") - magnitude: Scroll magnitude in pixels - destination_x: Destination X coordinate for drag operations - destination_y: Destination Y coordinate for drag operations - keys: Key combination string for key actions (e.g., "ctrl+c") - url: URL for navigation actions - """ - - action_type: str - x: int | None = None - y: int | None = None - text: str | None = None - press_enter: bool | None = None - clear_before_typing: bool | None = None - direction: str | None = None - magnitude: int | None = None - destination_x: int | None = None - destination_y: int | None = None - keys: str | None = None - url: str | None = None diff --git a/src/oagi/converters/qwen3.py b/src/oagi/converters/qwen3.py deleted file mode 100644 index 94a3b19..0000000 --- a/src/oagi/converters/qwen3.py +++ /dev/null @@ -1,194 +0,0 @@ -# 
----------------------------------------------------------------------------- -# Copyright (c) OpenAGI Foundation -# All rights reserved. -# -# This file is part of the official API project. -# Licensed under the MIT License. -# ----------------------------------------------------------------------------- -"""Qwen3 action converter. - -This module provides the Qwen3ActionConverter for converting Qwen3-VL -actions to pyautogui command strings. -""" - -from typing import Any - -from .base import BaseActionConverter, ConverterConfig -from .models import Qwen3Action - -# Qwen3 uses normalized 0-999 coordinate space -QWEN3_COORD_SIZE = 999 - - -class Qwen3ActionConverter(BaseActionConverter[Qwen3Action]): - """Convert Qwen3 CUA actions to pyautogui command strings. - - This converter handles: - 1. Coordinate scaling from 0-999 space to sandbox dimensions (1920x1080) - 2. Action format conversion from Qwen3 format to pyautogui strings - 3. Key name normalization for hotkey combinations - 4. Cursor position tracking for scroll and drag actions - - The output can be converted to runtime API steps via action_string_to_step(). - """ - - def __init__( - self, - *, - config: ConverterConfig | None = None, - logger: Any | None = None, - ): - """Initialize the Qwen3 converter. - - Qwen3 starts with cursor at screen center by default. 
- """ - super().__init__(config=config, logger=logger) - # Qwen3 starts cursor at center - self._last_x = self.config.sandbox_width // 2 - self._last_y = self.config.sandbox_height // 2 - - @property - def coord_width(self) -> int: - return QWEN3_COORD_SIZE - - @property - def coord_height(self) -> int: - return QWEN3_COORD_SIZE - - def _get_coords_from_action(self, action: Qwen3Action) -> tuple[int, int]: - """Extract and scale coordinates from action, falling back to last position.""" - if action.coordinate is not None and len(action.coordinate) >= 2: - x, y = action.coordinate[:2] - scaled_x, scaled_y = self.scale_coordinate(int(x), int(y)) - self._last_x, self._last_y = scaled_x, scaled_y - return scaled_x, scaled_y - else: - return self._last_x, self._last_y - - def _convert_single_action(self, action: Qwen3Action) -> list[str]: - """Convert a single Qwen3 action to pyautogui command string(s).""" - action_type = action.action_type.lower() - - if action_type == "mouse_move": - x, y = self._get_coords_from_action(action) - return [f"pyautogui.moveTo({x}, {y})"] - - if action_type == "left_click": - x, y = self._get_coords_from_action(action) - return [f"pyautogui.click(x={x}, y={y})"] - - if action_type == "double_click": - x, y = self._get_coords_from_action(action) - return [f"pyautogui.doubleClick(x={x}, y={y})"] - - if action_type == "triple_click": - x, y = self._get_coords_from_action(action) - return [f"pyautogui.tripleClick(x={x}, y={y})"] - - if action_type == "right_click": - x, y = self._get_coords_from_action(action) - return [f"pyautogui.rightClick(x={x}, y={y})"] - - if action_type == "middle_click": - x, y = self._get_coords_from_action(action) - return [f"pyautogui.click(x={x}, y={y}, button='middle')"] - - if action_type == "left_click_drag": - sx, sy = self._last_x, self._last_y - - if action.coordinate is None or len(action.coordinate) < 2: - raise ValueError( - "coordinate (end position) is required for left_click_drag" - ) - - ex, ey = 
self.scale_coordinate( - int(action.coordinate[0]), int(action.coordinate[1]) - ) - self._last_x, self._last_y = ex, ey - - return [ - f"pyautogui.moveTo({sx}, {sy})", - f"pyautogui.dragTo({ex}, {ey}, duration={self.config.drag_duration})", - ] - - if action_type == "type": - if action.text is None: - raise ValueError("text is required for type action") - text = action.text.replace("\\", "\\\\").replace("'", "\\'") - return [f"pyautogui.typewrite('{text}')"] - - if action_type == "key": - if action.keys is None or len(action.keys) == 0: - raise ValueError("keys array is required for key action") - - keys = [self.normalize_key(k) for k in action.keys] - keys = [k for k in keys if k] - - if not keys: - raise ValueError(f"Invalid key combination: {action.keys}") - - keys_str = ", ".join(repr(k) for k in keys) - return [ - f"pyautogui.hotkey({keys_str}, interval={self.config.hotkey_interval})" - ] - - if action_type in ("scroll", "hscroll"): - x, y = self._get_coords_from_action(action) - - pixels = action.pixels if action.pixels is not None else 0 - if pixels >= 0: - scroll_val = self.config.scroll_amount - else: - scroll_val = -self.config.scroll_amount - - return [ - f"pyautogui.moveTo({x}, {y})", - f"pyautogui.scroll({scroll_val})", - ] - - if action_type == "wait": - duration = ( - action.time if action.time is not None else self.config.wait_duration - ) - return [f"WAIT({duration})"] - - if action_type == "terminate": - status = action.status or "success" - self._log_info(f"Task terminated with status: {status}") - return ["DONE"] - - if action_type == "answer": - answer_text = action.text or "" - self._log_info(f"Model answer: {answer_text}") - return [] # No-op - - self._log_debug(f"Unknown Qwen3 action type: {action_type}") - return [] - - def serialize_actions(self, actions: list[Qwen3Action]) -> list[dict[str, Any]]: - """Serialize Qwen3 actions for trajectory logging.""" - serialized = [] - for action in actions or []: - serialized.append( - { - "type": 
action.action_type, - "coordinate": list(action.coordinate) - if action.coordinate - else None, - "text": action.text, - "keys": action.keys, - "pixels": action.pixels, - "time": action.time, - "status": action.status, - } - ) - return serialized - - def update_cursor(self, x: int, y: int) -> None: - """Update the cursor position after action execution.""" - self._last_x = x - self._last_y = y - - def get_cursor(self) -> tuple[int, int]: - """Get current cursor position in sandbox coordinates.""" - return self._last_x, self._last_y diff --git a/tests/test_oagi_action_converter.py b/tests/test_oagi_action_converter.py new file mode 100644 index 0000000..883492b --- /dev/null +++ b/tests/test_oagi_action_converter.py @@ -0,0 +1,135 @@ +# ----------------------------------------------------------------------------- +# Copyright (c) OpenAGI Foundation +# All rights reserved. +# +# This file is part of the official API project. +# Licensed under the MIT License. +# ----------------------------------------------------------------------------- +"""Tests for OagiActionConverter.""" + +import pytest + +from oagi.converters import BaseActionConverter, ConverterConfig, OagiActionConverter +from oagi.types import Action, ActionType + + +@pytest.fixture +def config(): + return ConverterConfig(sandbox_width=1920, sandbox_height=1080) + + +@pytest.fixture +def converter(config): + return OagiActionConverter(config=config) + + +class TestCoordinateBasedActions: + """Test actions with coordinates.""" + + @pytest.mark.parametrize( + "action_type,argument,expected_cmd", + [ + (ActionType.CLICK, "500, 300", "pyautogui.click(x=960, y=324)"), + (ActionType.LEFT_DOUBLE, "400, 250", "pyautogui.doubleClick(x=768, y=270)"), + (ActionType.LEFT_TRIPLE, "350, 200", "pyautogui.tripleClick(x=672, y=216)"), + (ActionType.RIGHT_SINGLE, "600, 400", "pyautogui.rightClick(x=1152, y=432)"), + ], + ) + def test_click_actions(self, converter, action_type, argument, expected_cmd): + action = 
Action(type=action_type, argument=argument, count=1) + result = converter([action]) + assert len(result) == 1 + assert result[0][0] == expected_cmd + assert result[0][1] is True # is_last + + +class TestDragAction: + def test_drag_generates_two_commands(self, converter, config): + action = Action(type=ActionType.DRAG, argument="100, 100, 500, 300", count=1) + result = converter([action]) + assert len(result) == 2 + assert "pyautogui.moveTo(192, 108)" in result[0][0] + assert f"pyautogui.dragTo(960, 324, duration={config.drag_duration})" in result[1][0] + + +class TestHotkeyAction: + def test_hotkey_conversion(self, converter, config): + action = Action(type=ActionType.HOTKEY, argument="ctrl+c", count=1) + result = converter([action]) + assert len(result) == 1 + assert f"pyautogui.hotkey('ctrl', 'c', interval={config.hotkey_interval})" in result[0][0] + + +class TestTypeAction: + def test_type_conversion(self, converter): + action = Action(type=ActionType.TYPE, argument="Hello World", count=1) + result = converter([action]) + assert len(result) == 1 + assert "pyautogui.typewrite" in result[0][0] + assert "Hello World" in result[0][0] + + +class TestScrollAction: + @pytest.mark.parametrize("direction,expected_amount", [("up", 2), ("down", -2)]) + def test_scroll_conversion(self, converter, direction, expected_amount): + action = Action(type=ActionType.SCROLL, argument=f"500, 300, {direction}", count=1) + result = converter([action]) + assert len(result) == 2 + assert "pyautogui.moveTo(960, 324)" in result[0][0] + assert f"pyautogui.scroll({expected_amount})" in result[1][0] + + +class TestSpecialActions: + def test_wait_action(self, converter, config): + action = Action(type=ActionType.WAIT, argument="", count=1) + result = converter([action]) + assert f"WAIT({config.wait_duration})" in result[0][0] + + def test_finish_action(self, converter): + action = Action(type=ActionType.FINISH, argument="", count=1) + result = converter([action]) + assert result[0][0] == 
"DONE" + + +class TestActionStringToStep: + def test_pyautogui_command(self, converter): + step = converter.action_string_to_step("pyautogui.click(x=100, y=200)") + assert step["type"] == "pyautogui" + assert step["parameters"]["code"] == "pyautogui.click(x=100, y=200)" + + def test_wait_command(self, converter): + step = converter.action_string_to_step("WAIT(5)") + assert step["type"] == "sleep" + assert step["parameters"]["seconds"] == 5.0 + + def test_done_command(self, converter): + step = converter.action_string_to_step("DONE") + assert step["type"] == "sleep" + assert step["parameters"]["seconds"] == 0 + + +class TestMultipleActions: + def test_action_count(self, converter): + action = Action(type=ActionType.CLICK, argument="500, 300", count=3) + result = converter([action]) + # Each click generates 1 command, repeated 3 times + assert len(result) == 3 + # Only the last one should have is_last=True + assert result[0][1] is False + assert result[1][1] is False + assert result[2][1] is True + + +class TestBaseActionConverterExports: + """Test that base class is properly exported for inheritance.""" + + def test_base_class_exported(self): + assert BaseActionConverter is not None + + def test_config_exported(self): + config = ConverterConfig() + assert config.sandbox_width == 1920 + assert config.sandbox_height == 1080 + + def test_oagi_converter_inherits_base(self, converter): + assert isinstance(converter, BaseActionConverter) From 257c7986e5aa1d03590dd3d568d067b57422ddf9 Mon Sep 17 00:00:00 2001 From: aoshen524 Date: Tue, 3 Feb 2026 02:42:00 +0000 Subject: [PATCH 03/11] feat(converters): add strict coordinate validation mode Add optional strict_coordinate_validation config option (default: False). When enabled, coordinates outside valid range [0, source_width/height] will raise ValueError instead of being clamped. This helps surface model output issues during training/debugging. Default behavior (clamp) remains unchanged for backwards compatibility. 
Usage: config = ConverterConfig(strict_coordinate_validation=True) converter = OagiActionConverter(config=config) Co-Authored-By: Claude Opus 4.5 --- src/oagi/converters/base.py | 3 ++ src/oagi/converters/oagi.py | 13 +++--- src/oagi/handler/utils.py | 39 +++++++++++++--- tests/test_oagi_action_converter.py | 72 +++++++++++++++++++++++++++++ 4 files changed, 114 insertions(+), 13 deletions(-) diff --git a/src/oagi/converters/base.py b/src/oagi/converters/base.py index fdf6ca2..23b1de0 100644 --- a/src/oagi/converters/base.py +++ b/src/oagi/converters/base.py @@ -41,6 +41,9 @@ class ConverterConfig: wait_duration: float = 1.0 hotkey_interval: float = 0.1 capslock_mode: str = "session" + strict_coordinate_validation: bool = False + """If True, raise ValueError when coordinates are outside valid range. + If False (default), clamp coordinates to valid range (original behavior).""" class BaseActionConverter(ABC, Generic[T]): diff --git a/src/oagi/converters/oagi.py b/src/oagi/converters/oagi.py index f081fe8..fe5e8eb 100644 --- a/src/oagi/converters/oagi.py +++ b/src/oagi/converters/oagi.py @@ -108,25 +108,26 @@ def _convert_single_action(self, action: Action) -> list[str]: scroll_amount = self.config.scroll_amount wait_duration = self.config.wait_duration hotkey_interval = self.config.hotkey_interval + strict = self.config.strict_coordinate_validation if action_type == ActionType.CLICK.value: - x, y = parse_click_coords(argument, self._coord_scaler) + x, y = parse_click_coords(argument, self._coord_scaler, strict=strict) return [f"pyautogui.click(x={x}, y={y})"] if action_type == ActionType.LEFT_DOUBLE.value: - x, y = parse_click_coords(argument, self._coord_scaler) + x, y = parse_click_coords(argument, self._coord_scaler, strict=strict) return [f"pyautogui.doubleClick(x={x}, y={y})"] if action_type == ActionType.LEFT_TRIPLE.value: - x, y = parse_click_coords(argument, self._coord_scaler) + x, y = parse_click_coords(argument, self._coord_scaler, strict=strict) return 
[f"pyautogui.tripleClick(x={x}, y={y})"] if action_type == ActionType.RIGHT_SINGLE.value: - x, y = parse_click_coords(argument, self._coord_scaler) + x, y = parse_click_coords(argument, self._coord_scaler, strict=strict) return [f"pyautogui.rightClick(x={x}, y={y})"] if action_type == ActionType.DRAG.value: - sx, sy, ex, ey = parse_drag_coords(argument, self._coord_scaler) + sx, sy, ex, ey = parse_drag_coords(argument, self._coord_scaler, strict=strict) return [ f"pyautogui.moveTo({sx}, {sy})", f"pyautogui.dragTo({ex}, {ey}, duration={drag_duration})", @@ -157,7 +158,7 @@ def _convert_single_action(self, action: Action) -> list[str]: return [f"pyautogui.typewrite({text!r})"] if action_type == ActionType.SCROLL.value: - x, y, direction = parse_scroll_coords(argument, self._coord_scaler) + x, y, direction = parse_scroll_coords(argument, self._coord_scaler, strict=strict) amount = scroll_amount if direction == "up" else -scroll_amount return [f"pyautogui.moveTo({x}, {y})", f"pyautogui.scroll({amount})"] diff --git a/src/oagi/handler/utils.py b/src/oagi/handler/utils.py index ff36b02..878af5b 100644 --- a/src/oagi/handler/utils.py +++ b/src/oagi/handler/utils.py @@ -301,6 +301,7 @@ def scale( *, clamp: bool = True, prevent_failsafe: bool = False, + strict: bool = False, ) -> tuple[int, int]: """Scale coordinates from source to target space. 
@@ -310,10 +311,28 @@ def scale( clamp: If True, clamp to valid target range prevent_failsafe: If True, offset corner coordinates by 1 pixel (prevents PyAutoGUI fail-safe trigger) + strict: If True, raise ValueError when coordinates are outside + valid source range [0, source_width] x [0, source_height] Returns: Tuple of (target_x, target_y) in target coordinate space + + Raises: + ValueError: If strict=True and coordinates are outside valid range """ + # Strict validation: check if coordinates are in valid source range + if strict: + if x < 0 or x > self.source_width: + raise ValueError( + f"x coordinate {x} out of valid range [0, {self.source_width}]. " + f"Coordinates must be normalized between 0 and {self.source_width}." + ) + if y < 0 or y > self.source_height: + raise ValueError( + f"y coordinate {y} out of valid range [0, {self.source_height}]. " + f"Coordinates must be normalized between 0 and {self.source_height}." + ) + scaled_x = round(x * self.scale_x) scaled_y = round(y * self.scale_y) @@ -456,6 +475,7 @@ def parse_click_coords( scaler: CoordinateScaler, *, prevent_failsafe: bool = False, + strict: bool = False, ) -> tuple[int, int]: """Parse click coordinates from argument string. 
@@ -463,12 +483,13 @@ def parse_click_coords( argument: Coordinate string in format "x, y" scaler: CoordinateScaler instance for coordinate transformation prevent_failsafe: If True, offset corner coordinates + strict: If True, raise ValueError for out-of-range coordinates Returns: Tuple of (x, y) in target coordinate space Raises: - ValueError: If coordinate format is invalid + ValueError: If coordinate format is invalid or (strict=True) out of range """ # Check for common format errors if " and " in argument.lower() or " then " in argument.lower(): @@ -487,7 +508,7 @@ def parse_click_coords( try: x = float(parts[0].strip()) y = float(parts[1].strip()) - return scaler.scale(x, y, prevent_failsafe=prevent_failsafe) + return scaler.scale(x, y, prevent_failsafe=prevent_failsafe, strict=strict) except (ValueError, IndexError) as e: raise ValueError( f"Failed to parse click coords '{argument}': {e}. " @@ -500,6 +521,7 @@ def parse_drag_coords( scaler: CoordinateScaler, *, prevent_failsafe: bool = False, + strict: bool = False, ) -> tuple[int, int, int, int]: """Parse drag coordinates from argument string. 
@@ -507,12 +529,13 @@ def parse_drag_coords( argument: Coordinate string in format "x1, y1, x2, y2" scaler: CoordinateScaler instance for coordinate transformation prevent_failsafe: If True, offset corner coordinates + strict: If True, raise ValueError for out-of-range coordinates Returns: Tuple of (x1, y1, x2, y2) in target coordinate space Raises: - ValueError: If coordinate format is invalid + ValueError: If coordinate format is invalid or (strict=True) out of range """ # Check for common format errors if " and " in argument.lower() or " then " in argument.lower(): @@ -533,8 +556,8 @@ def parse_drag_coords( sy = float(parts[1].strip()) ex = float(parts[2].strip()) ey = float(parts[3].strip()) - x1, y1 = scaler.scale(sx, sy, prevent_failsafe=prevent_failsafe) - x2, y2 = scaler.scale(ex, ey, prevent_failsafe=prevent_failsafe) + x1, y1 = scaler.scale(sx, sy, prevent_failsafe=prevent_failsafe, strict=strict) + x2, y2 = scaler.scale(ex, ey, prevent_failsafe=prevent_failsafe, strict=strict) return x1, y1, x2, y2 except (ValueError, IndexError) as e: raise ValueError( @@ -548,6 +571,7 @@ def parse_scroll_coords( scaler: CoordinateScaler, *, prevent_failsafe: bool = False, + strict: bool = False, ) -> tuple[int, int, str]: """Parse scroll coordinates and direction from argument string. @@ -555,12 +579,13 @@ def parse_scroll_coords( argument: Scroll string in format "x, y, direction" scaler: CoordinateScaler instance for coordinate transformation prevent_failsafe: If True, offset corner coordinates + strict: If True, raise ValueError for out-of-range coordinates Returns: Tuple of (x, y, direction) where direction is 'up' or 'down' Raises: - ValueError: If format is invalid + ValueError: If format is invalid or (strict=True) coordinates out of range """ parts = [p.strip() for p in argument.split(",")] if len(parts) != 3: @@ -579,7 +604,7 @@ def parse_scroll_coords( f"Invalid scroll direction: '{direction}'. Use 'up' or 'down'." 
) - scaled_x, scaled_y = scaler.scale(x, y, prevent_failsafe=prevent_failsafe) + scaled_x, scaled_y = scaler.scale(x, y, prevent_failsafe=prevent_failsafe, strict=strict) return scaled_x, scaled_y, direction except (ValueError, IndexError) as e: if "scroll direction" in str(e): diff --git a/tests/test_oagi_action_converter.py b/tests/test_oagi_action_converter.py index 883492b..4aa24d7 100644 --- a/tests/test_oagi_action_converter.py +++ b/tests/test_oagi_action_converter.py @@ -133,3 +133,75 @@ def test_config_exported(self): def test_oagi_converter_inherits_base(self, converter): assert isinstance(converter, BaseActionConverter) + + +class TestStrictCoordinateValidation: + """Test strict coordinate validation mode.""" + + @pytest.fixture + def strict_config(self): + return ConverterConfig( + sandbox_width=1920, + sandbox_height=1080, + strict_coordinate_validation=True, + ) + + @pytest.fixture + def strict_converter(self, strict_config): + return OagiActionConverter(config=strict_config) + + def test_strict_mode_disabled_by_default(self): + config = ConverterConfig() + assert config.strict_coordinate_validation is False + + def test_strict_mode_clamps_valid_coordinates(self, strict_converter): + """Valid coordinates within [0, 1000] should work in strict mode.""" + action = Action(type=ActionType.CLICK, argument="500, 500", count=1) + result = strict_converter([action]) + assert len(result) == 1 + assert "pyautogui.click" in result[0][0] + + def test_strict_mode_raises_on_negative_x(self, strict_converter): + """Negative x coordinate should raise error in strict mode.""" + action = Action(type=ActionType.CLICK, argument="-10, 500", count=1) + with pytest.raises(RuntimeError, match="x coordinate .* out of valid range"): + strict_converter([action]) + + def test_strict_mode_raises_on_negative_y(self, strict_converter): + """Negative y coordinate should raise error in strict mode.""" + action = Action(type=ActionType.CLICK, argument="500, -10", count=1) + with 
pytest.raises(RuntimeError, match="y coordinate .* out of valid range"): + strict_converter([action]) + + def test_strict_mode_raises_on_x_exceeding_max(self, strict_converter): + """x coordinate > 1000 should raise error in strict mode.""" + action = Action(type=ActionType.CLICK, argument="1050, 500", count=1) + with pytest.raises(RuntimeError, match="x coordinate .* out of valid range"): + strict_converter([action]) + + def test_strict_mode_raises_on_y_exceeding_max(self, strict_converter): + """y coordinate > 1000 should raise error in strict mode.""" + action = Action(type=ActionType.CLICK, argument="500, 1050", count=1) + with pytest.raises(RuntimeError, match="y coordinate .* out of valid range"): + strict_converter([action]) + + def test_non_strict_mode_clamps_out_of_range(self, converter): + """Non-strict mode should clamp out-of-range coordinates.""" + # This should not raise, coordinates get clamped + action = Action(type=ActionType.CLICK, argument="1050, 1050", count=1) + result = converter([action]) + assert len(result) == 1 + # Coordinates should be clamped to max (1919, 1079) + assert "pyautogui.click(x=1919, y=1079)" in result[0][0] + + def test_strict_mode_for_drag(self, strict_converter): + """Drag action should also validate coordinates in strict mode.""" + action = Action(type=ActionType.DRAG, argument="500, 500, 1100, 500", count=1) + with pytest.raises(RuntimeError, match="x coordinate .* out of valid range"): + strict_converter([action]) + + def test_strict_mode_for_scroll(self, strict_converter): + """Scroll action should also validate coordinates in strict mode.""" + action = Action(type=ActionType.SCROLL, argument="1100, 500, up", count=1) + with pytest.raises(RuntimeError, match="x coordinate .* out of valid range"): + strict_converter([action]) From 10055791642386ebb4f5ee5993c86f43aa1e582b Mon Sep 17 00:00:00 2001 From: aoshen524 Date: Tue, 3 Feb 2026 02:53:19 +0000 Subject: [PATCH 04/11] refactor(utils): simplify KEY_MAP to match 
original handler Reduce KEY_MAP to minimal mappings matching original PyautoguiActionHandler.hotkey_variations_mapping: - caps_lock, caps -> capslock - page_up -> pageup - page_down -> pagedown Co-Authored-By: Claude Opus 4.5 --- src/oagi/handler/utils.py | 47 ++++++--------------------------------- 1 file changed, 7 insertions(+), 40 deletions(-) diff --git a/src/oagi/handler/utils.py b/src/oagi/handler/utils.py index 878af5b..3e0e4dd 100644 --- a/src/oagi/handler/utils.py +++ b/src/oagi/handler/utils.py @@ -17,49 +17,16 @@ # Key Normalization Mapping # ============================================================================= -# Unified key mapping - normalizes various key name formats to pyautogui names +# Minimal key mapping - only normalizes common variations to pyautogui names +# Matches original PyautoguiActionHandler.hotkey_variations_mapping behavior KEY_MAP: dict[str, str] = { - # Modifier keys - "ctrl": "ctrl", - "control": "ctrl", - "alt": "alt", - "option": "alt", - "shift": "shift", - "cmd": "command", - "command": "command", - "meta": "win", - "super": "win", - "windows": "win", - "win": "win", - # Enter/Return - "return": "enter", - "enter": "enter", - # Escape - "escape": "escape", - "esc": "escape", - # Page navigation - "pageup": "pageup", - "page_up": "pageup", - "pgup": "pageup", - "pagedown": "pagedown", - "page_down": "pagedown", - "pgdn": "pagedown", - # Lock keys - "capslock": "capslock", + # Caps lock variations "caps_lock": "capslock", "caps": "capslock", - "numlock": "numlock", - "num_lock": "numlock", - "scrolllock": "scrolllock", - "scroll_lock": "scrolllock", - # Print screen - "printscreen": "printscreen", - "print_screen": "printscreen", - "prtsc": "printscreen", - "prtscr": "printscreen", - # Media keys - "mute": "volumemute", - "play": "playpause", + # Page up variations + "page_up": "pageup", + # Page down variations + "page_down": "pagedown", } # Valid pyautogui key names From 28507005708c7a737003e44b4564ce8781088e80 Mon Sep 
17 00:00:00 2001 From: aoshen524 Date: Tue, 3 Feb 2026 03:06:44 +0000 Subject: [PATCH 05/11] fix(utils): match KEY_MAP to original handler exactly Update KEY_MAP to normalize page keys to short forms (pgup/pgdn) matching original PyautoguiActionHandler.hotkey_variations_mapping: - page_up, pageup -> pgup - page_down, pagedown -> pgdn Co-Authored-By: Claude Opus 4.5 --- src/oagi/handler/utils.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/src/oagi/handler/utils.py b/src/oagi/handler/utils.py index 3e0e4dd..d711e15 100644 --- a/src/oagi/handler/utils.py +++ b/src/oagi/handler/utils.py @@ -18,15 +18,20 @@ # ============================================================================= # Minimal key mapping - only normalizes common variations to pyautogui names -# Matches original PyautoguiActionHandler.hotkey_variations_mapping behavior +# Matches original PyautoguiActionHandler.hotkey_variations_mapping behavior exactly: +# "capslock": ["caps_lock", "caps", "capslock"] -> capslock +# "pgup": ["page_up", "pageup"] -> pgup +# "pgdn": ["page_down", "pagedown"] -> pgdn KEY_MAP: dict[str, str] = { - # Caps lock variations + # Caps lock variations -> capslock "caps_lock": "capslock", "caps": "capslock", - # Page up variations - "page_up": "pageup", - # Page down variations - "page_down": "pagedown", + # Page up variations -> pgup (short form, matching original) + "page_up": "pgup", + "pageup": "pgup", + # Page down variations -> pgdn (short form, matching original) + "page_down": "pgdn", + "pagedown": "pgdn", } # Valid pyautogui key names From 0fd6498e2c8b377b00d32e0522e08cfa823264dc Mon Sep 17 00:00:00 2001 From: aoshen524 Date: Tue, 3 Feb 2026 03:19:49 +0000 Subject: [PATCH 06/11] fix(utils): use int() instead of round() for coordinate scaling Match the original PyautoguiActionHandler behavior exactly. The original used int() (truncation) while the new CoordinateScaler used round() (rounding to nearest). 
This could cause 1-pixel differences in some edge cases. Co-Authored-By: Claude Opus 4.5 --- src/oagi/handler/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/oagi/handler/utils.py b/src/oagi/handler/utils.py index d711e15..a209574 100644 --- a/src/oagi/handler/utils.py +++ b/src/oagi/handler/utils.py @@ -305,8 +305,8 @@ def scale( f"Coordinates must be normalized between 0 and {self.source_height}." ) - scaled_x = round(x * self.scale_x) - scaled_y = round(y * self.scale_y) + scaled_x = int(x * self.scale_x) + scaled_y = int(y * self.scale_y) if clamp: # Clamp to valid range From 4c363373614c5af062fe0d39a57bc4168f3ccd10 Mon Sep 17 00:00:00 2001 From: aoshen524 Date: Tue, 3 Feb 2026 03:38:10 +0000 Subject: [PATCH 07/11] refactor(utils): use round() for coordinate scaling Using round() instead of int() provides more accurate coordinate transformation by rounding to the nearest pixel rather than truncating. This is a minor improvement over the original PyautoguiActionHandler behavior. Co-Authored-By: Claude Opus 4.5 --- src/oagi/handler/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/oagi/handler/utils.py b/src/oagi/handler/utils.py index a209574..d711e15 100644 --- a/src/oagi/handler/utils.py +++ b/src/oagi/handler/utils.py @@ -305,8 +305,8 @@ def scale( f"Coordinates must be normalized between 0 and {self.source_height}." 
) - scaled_x = int(x * self.scale_x) - scaled_y = int(y * self.scale_y) + scaled_x = round(x * self.scale_x) + scaled_y = round(y * self.scale_y) if clamp: # Clamp to valid range From dce5e238e1549fb6b87c663c34f52db83827e1d0 Mon Sep 17 00:00:00 2001 From: Tianyi Hu Date: Tue, 3 Feb 2026 12:04:42 +0800 Subject: [PATCH 08/11] refactor: fix style and tests --- src/oagi/converters/__init__.py | 4 +- src/oagi/converters/base.py | 4 +- src/oagi/converters/oagi.py | 13 ++- src/oagi/handler/utils.py | 4 +- tests/test_oagi_action_converter.py | 122 +++++++++---------------- tests/test_pyautogui_action_handler.py | 16 ++-- 6 files changed, 68 insertions(+), 95 deletions(-) diff --git a/src/oagi/converters/__init__.py b/src/oagi/converters/__init__.py index 062c94a..ff5433e 100644 --- a/src/oagi/converters/__init__.py +++ b/src/oagi/converters/__init__.py @@ -46,8 +46,8 @@ def serialize_actions(self, actions: list[MyAction]) -> list[dict]: ... """ -from oagi.converters.base import BaseActionConverter, ConverterConfig -from oagi.converters.oagi import OagiActionConverter +from .base import BaseActionConverter, ConverterConfig +from .oagi import OagiActionConverter __all__ = [ "BaseActionConverter", diff --git a/src/oagi/converters/base.py b/src/oagi/converters/base.py index 23b1de0..fb7a9f1 100644 --- a/src/oagi/converters/base.py +++ b/src/oagi/converters/base.py @@ -16,8 +16,8 @@ from dataclasses import dataclass from typing import Any, Generic, TypeVar -from oagi.handler.capslock_manager import CapsLockManager -from oagi.handler.utils import ( +from ..handler.capslock_manager import CapsLockManager +from ..handler.utils import ( CoordinateScaler, normalize_key, parse_hotkey, diff --git a/src/oagi/converters/oagi.py b/src/oagi/converters/oagi.py index fe5e8eb..507f273 100644 --- a/src/oagi/converters/oagi.py +++ b/src/oagi/converters/oagi.py @@ -13,13 +13,12 @@ from typing import Any -from oagi.handler.utils import ( +from ..handler.utils import ( parse_click_coords, 
parse_drag_coords, parse_scroll_coords, ) -from oagi.types import Action, ActionType - +from ..types import Action, ActionType from .base import BaseActionConverter # OAGI uses normalized 0-1000 coordinate space @@ -127,7 +126,9 @@ def _convert_single_action(self, action: Action) -> list[str]: return [f"pyautogui.rightClick(x={x}, y={y})"] if action_type == ActionType.DRAG.value: - sx, sy, ex, ey = parse_drag_coords(argument, self._coord_scaler, strict=strict) + sx, sy, ex, ey = parse_drag_coords( + argument, self._coord_scaler, strict=strict + ) return [ f"pyautogui.moveTo({sx}, {sy})", f"pyautogui.dragTo({ex}, {ey}, duration={drag_duration})", @@ -158,7 +159,9 @@ def _convert_single_action(self, action: Action) -> list[str]: return [f"pyautogui.typewrite({text!r})"] if action_type == ActionType.SCROLL.value: - x, y, direction = parse_scroll_coords(argument, self._coord_scaler, strict=strict) + x, y, direction = parse_scroll_coords( + argument, self._coord_scaler, strict=strict + ) amount = scroll_amount if direction == "up" else -scroll_amount return [f"pyautogui.moveTo({x}, {y})", f"pyautogui.scroll({amount})"] diff --git a/src/oagi/handler/utils.py b/src/oagi/handler/utils.py index d711e15..db344fc 100644 --- a/src/oagi/handler/utils.py +++ b/src/oagi/handler/utils.py @@ -576,7 +576,9 @@ def parse_scroll_coords( f"Invalid scroll direction: '{direction}'. Use 'up' or 'down'." ) - scaled_x, scaled_y = scaler.scale(x, y, prevent_failsafe=prevent_failsafe, strict=strict) + scaled_x, scaled_y = scaler.scale( + x, y, prevent_failsafe=prevent_failsafe, strict=strict + ) return scaled_x, scaled_y, direction except (ValueError, IndexError) as e: if "scroll direction" in str(e): diff --git a/tests/test_oagi_action_converter.py b/tests/test_oagi_action_converter.py index 4aa24d7..0969c57 100644 --- a/tests/test_oagi_action_converter.py +++ b/tests/test_oagi_action_converter.py @@ -5,11 +5,10 @@ # This file is part of the official API project. 
# Licensed under the MIT License. # ----------------------------------------------------------------------------- -"""Tests for OagiActionConverter.""" import pytest -from oagi.converters import BaseActionConverter, ConverterConfig, OagiActionConverter +from oagi.converters import ConverterConfig, OagiActionConverter from oagi.types import Action, ActionType @@ -24,15 +23,17 @@ def converter(config): class TestCoordinateBasedActions: - """Test actions with coordinates.""" - @pytest.mark.parametrize( "action_type,argument,expected_cmd", [ (ActionType.CLICK, "500, 300", "pyautogui.click(x=960, y=324)"), (ActionType.LEFT_DOUBLE, "400, 250", "pyautogui.doubleClick(x=768, y=270)"), (ActionType.LEFT_TRIPLE, "350, 200", "pyautogui.tripleClick(x=672, y=216)"), - (ActionType.RIGHT_SINGLE, "600, 400", "pyautogui.rightClick(x=1152, y=432)"), + ( + ActionType.RIGHT_SINGLE, + "600, 400", + "pyautogui.rightClick(x=1152, y=432)", + ), ], ) def test_click_actions(self, converter, action_type, argument, expected_cmd): @@ -49,7 +50,10 @@ def test_drag_generates_two_commands(self, converter, config): result = converter([action]) assert len(result) == 2 assert "pyautogui.moveTo(192, 108)" in result[0][0] - assert f"pyautogui.dragTo(960, 324, duration={config.drag_duration})" in result[1][0] + assert ( + f"pyautogui.dragTo(960, 324, duration={config.drag_duration})" + in result[1][0] + ) class TestHotkeyAction: @@ -57,7 +61,10 @@ def test_hotkey_conversion(self, converter, config): action = Action(type=ActionType.HOTKEY, argument="ctrl+c", count=1) result = converter([action]) assert len(result) == 1 - assert f"pyautogui.hotkey('ctrl', 'c', interval={config.hotkey_interval})" in result[0][0] + assert ( + f"pyautogui.hotkey('ctrl', 'c', interval={config.hotkey_interval})" + in result[0][0] + ) class TestTypeAction: @@ -72,7 +79,9 @@ def test_type_conversion(self, converter): class TestScrollAction: @pytest.mark.parametrize("direction,expected_amount", [("up", 2), ("down", -2)]) def 
test_scroll_conversion(self, converter, direction, expected_amount): - action = Action(type=ActionType.SCROLL, argument=f"500, 300, {direction}", count=1) + action = Action( + type=ActionType.SCROLL, argument=f"500, 300, {direction}", count=1 + ) result = converter([action]) assert len(result) == 2 assert "pyautogui.moveTo(960, 324)" in result[0][0] @@ -120,88 +129,47 @@ def test_action_count(self, converter): assert result[2][1] is True -class TestBaseActionConverterExports: - """Test that base class is properly exported for inheritance.""" - - def test_base_class_exported(self): - assert BaseActionConverter is not None - - def test_config_exported(self): - config = ConverterConfig() - assert config.sandbox_width == 1920 - assert config.sandbox_height == 1080 - - def test_oagi_converter_inherits_base(self, converter): - assert isinstance(converter, BaseActionConverter) - - class TestStrictCoordinateValidation: - """Test strict coordinate validation mode.""" - @pytest.fixture - def strict_config(self): - return ConverterConfig( + def strict_converter(self): + config = ConverterConfig( sandbox_width=1920, sandbox_height=1080, strict_coordinate_validation=True, ) + return OagiActionConverter(config=config) - @pytest.fixture - def strict_converter(self, strict_config): - return OagiActionConverter(config=strict_config) - - def test_strict_mode_disabled_by_default(self): - config = ConverterConfig() - assert config.strict_coordinate_validation is False - - def test_strict_mode_clamps_valid_coordinates(self, strict_converter): - """Valid coordinates within [0, 1000] should work in strict mode.""" - action = Action(type=ActionType.CLICK, argument="500, 500", count=1) - result = strict_converter([action]) - assert len(result) == 1 - assert "pyautogui.click" in result[0][0] - - def test_strict_mode_raises_on_negative_x(self, strict_converter): - """Negative x coordinate should raise error in strict mode.""" - action = Action(type=ActionType.CLICK, argument="-10, 500", 
count=1) - with pytest.raises(RuntimeError, match="x coordinate .* out of valid range"): - strict_converter([action]) - - def test_strict_mode_raises_on_negative_y(self, strict_converter): - """Negative y coordinate should raise error in strict mode.""" - action = Action(type=ActionType.CLICK, argument="500, -10", count=1) - with pytest.raises(RuntimeError, match="y coordinate .* out of valid range"): - strict_converter([action]) - - def test_strict_mode_raises_on_x_exceeding_max(self, strict_converter): - """x coordinate > 1000 should raise error in strict mode.""" - action = Action(type=ActionType.CLICK, argument="1050, 500", count=1) - with pytest.raises(RuntimeError, match="x coordinate .* out of valid range"): - strict_converter([action]) - - def test_strict_mode_raises_on_y_exceeding_max(self, strict_converter): - """y coordinate > 1000 should raise error in strict mode.""" - action = Action(type=ActionType.CLICK, argument="500, 1050", count=1) - with pytest.raises(RuntimeError, match="y coordinate .* out of valid range"): + @pytest.mark.parametrize( + "argument,match_pattern", + [ + ("-10, 500", "x coordinate .* out of valid range"), + ("500, -10", "y coordinate .* out of valid range"), + ("1050, 500", "x coordinate .* out of valid range"), + ("500, 1050", "y coordinate .* out of valid range"), + ], + ) + def test_strict_mode_rejects_out_of_range( + self, strict_converter, argument, match_pattern + ): + action = Action(type=ActionType.CLICK, argument=argument, count=1) + with pytest.raises(RuntimeError, match=match_pattern): strict_converter([action]) def test_non_strict_mode_clamps_out_of_range(self, converter): - """Non-strict mode should clamp out-of-range coordinates.""" - # This should not raise, coordinates get clamped action = Action(type=ActionType.CLICK, argument="1050, 1050", count=1) result = converter([action]) - assert len(result) == 1 - # Coordinates should be clamped to max (1919, 1079) assert "pyautogui.click(x=1919, y=1079)" in result[0][0] 
- def test_strict_mode_for_drag(self, strict_converter): - """Drag action should also validate coordinates in strict mode.""" - action = Action(type=ActionType.DRAG, argument="500, 500, 1100, 500", count=1) - with pytest.raises(RuntimeError, match="x coordinate .* out of valid range"): - strict_converter([action]) - - def test_strict_mode_for_scroll(self, strict_converter): - """Scroll action should also validate coordinates in strict mode.""" - action = Action(type=ActionType.SCROLL, argument="1100, 500, up", count=1) + @pytest.mark.parametrize( + "action_type,argument", + [ + (ActionType.DRAG, "500, 500, 1100, 500"), + (ActionType.SCROLL, "1100, 500, up"), + ], + ) + def test_strict_mode_for_other_actions( + self, strict_converter, action_type, argument + ): + action = Action(type=action_type, argument=argument, count=1) with pytest.raises(RuntimeError, match="x coordinate .* out of valid range"): strict_converter([action]) diff --git a/tests/test_pyautogui_action_handler.py b/tests/test_pyautogui_action_handler.py index 2d164ad..bed2ec2 100644 --- a/tests/test_pyautogui_action_handler.py +++ b/tests/test_pyautogui_action_handler.py @@ -245,15 +245,15 @@ class TestCornerCoordinatesHandling: [ # Top-left corner ("0, 0", (1, 1)), - ("1, 1", (1, 1)), + ("1, 1", (2, 1)), # Top-right corner (assuming 1920x1080 screen) - ("1000, 0", (1919, 1)), + ("1000, 0", (1918, 1)), ("999, 1", (1918, 1)), # Bottom-left corner - ("0, 1000", (1, 1079)), - ("1, 999", (1, 1078)), + ("0, 1000", (1, 1078)), + ("1, 999", (2, 1078)), # Bottom-right corner - ("1000, 1000", (1919, 1079)), + ("1000, 1000", (1918, 1078)), ("999, 999", (1918, 1078)), # Middle coordinates should not be affected ("500, 500", (960, 540)), @@ -280,7 +280,7 @@ def test_drag_with_corner_coordinates(self, mock_pyautogui, config): # Should adjust corner coordinates to prevent fail-safe mock_pyautogui.moveTo.assert_called_once_with(1, 1) mock_pyautogui.dragTo.assert_called_once_with( - 1919, 1079, 
duration=config.drag_duration, button="left" + 1918, 1078, duration=config.drag_duration, button="left" ) def test_scroll_with_corner_coordinates(self, mock_pyautogui, config): @@ -310,8 +310,8 @@ def test_multiple_clicks_at_corners(self, mock_pyautogui): # Check moveTo was called with the adjusted corner coordinates moveTo_calls = mock_pyautogui.moveTo.call_args_list assert (1, 1) in [call[0] for call in moveTo_calls] - assert (1919, 1) in [call[0] for call in moveTo_calls] - assert (1, 1079) in [call[0] for call in moveTo_calls] + assert (1918, 1) in [call[0] for call in moveTo_calls] + assert (1, 1078) in [call[0] for call in moveTo_calls] # Click methods called without coordinates mock_pyautogui.doubleClick.assert_called_once_with() mock_pyautogui.tripleClick.assert_called_once_with() From c6eae06040e396df507a24d4cd786e6a211e800c Mon Sep 17 00:00:00 2001 From: Tianyi Hu Date: Tue, 3 Feb 2026 12:14:16 +0800 Subject: [PATCH 09/11] refactor: make ydotool_action_handler use shared utils --- src/oagi/handler/ydotool_action_handler.py | 61 +++++++--------------- 1 file changed, 20 insertions(+), 41 deletions(-) diff --git a/src/oagi/handler/ydotool_action_handler.py b/src/oagi/handler/ydotool_action_handler.py index bd5fce2..354b842 100644 --- a/src/oagi/handler/ydotool_action_handler.py +++ b/src/oagi/handler/ydotool_action_handler.py @@ -15,6 +15,7 @@ from ..constants import DEFAULT_STEP_DELAY from ..types import Action, ActionType, parse_coords, parse_drag_coords, parse_scroll from .capslock_manager import CapsLockManager +from .utils import CoordinateScaler, normalize_key, parse_hotkey from .wayland_support import Ydotool, get_screen_size @@ -73,6 +74,13 @@ def __init__(self, config: YdotoolConfig | None = None) -> None: self.caps_manager = CapsLockManager(mode=self.config.capslock_mode) # The origin position of coordinates (the top-left corner of the screen) self.origin_x, self.origin_y = 0, 0 + # Initialize coordinate scaler + self._coord_scaler = 
CoordinateScaler( + source_width=1000, + source_height=1000, + target_width=self.screen_width, + target_height=self.screen_height, + ) def reset(self): """Reset handler state. @@ -90,6 +98,12 @@ def set_target_screen(self, screen: Screen) -> None: """ self.screen_width, self.screen_height = screen.width, screen.height self.origin_x, self.origin_y = screen.x, screen.y + self._coord_scaler = CoordinateScaler( + source_width=1000, + source_height=1000, + target_width=self.screen_width, + target_height=self.screen_height, + ) def _execute_action(self, action: Action) -> bool: """ @@ -168,45 +182,14 @@ def _execute_action(self, action: Action) -> bool: return finished def _denormalize_coords(self, x: float, y: float) -> tuple[int, int]: - """Convert coordinates from 0-1000 range to actual screen coordinates. - - Also handles corner coordinates to prevent PyAutoGUI fail-safe trigger. - Corner coordinates (0,0), (0,max), (max,0), (max,max) are offset by 1 pixel. - """ - screen_x = int(x * self.screen_width / 1000) - screen_y = int(y * self.screen_height / 1000) - - # Prevent fail-safe by adjusting corner coordinates - # Check if coordinates are at screen corners (with small tolerance) - if screen_x < 1: - screen_x = 1 - elif screen_x > self.screen_width - 1: - screen_x = self.screen_width - 1 - - if screen_y < 1: - screen_y = 1 - elif screen_y > self.screen_height - 1: - screen_y = self.screen_height - 1 - - # Add origin offset to convert relative to top-left corner - screen_x += self.origin_x - screen_y += self.origin_y - - return screen_x, screen_y + """Convert coordinates from 0-1000 range to actual screen coordinates.""" + screen_x, screen_y = self._coord_scaler.scale(x, y, prevent_failsafe=True) + # Add origin offset for multi-screen support + return screen_x + self.origin_x, screen_y + self.origin_y def _normalize_key(self, key: str) -> str: """Normalize key names for consistency.""" - key = key.strip().lower() - # Normalize caps lock variations - 
hotkey_variations_mapping = { - "capslock": ["caps_lock", "caps", "capslock"], - "pgup": ["page_up", "pageup"], - "pgdn": ["page_down", "pagedown"], - } - for normalized, variations in hotkey_variations_mapping.items(): - if key in variations: - return normalized - return key + return normalize_key(key) def _parse_coords(self, args_str: str) -> tuple[int, int]: """Extract x, y coordinates from argument string.""" @@ -234,11 +217,7 @@ def _parse_scroll(self, args_str: str) -> tuple[int, int, str]: def _parse_hotkey(self, args_str: str) -> list[str]: """Parse hotkey string into list of keys.""" - # Remove parentheses if present - args_str = args_str.strip("()") - # Split by '+' to get individual keys - keys = [self._normalize_key(key) for key in args_str.split("+")] - return keys + return parse_hotkey(args_str.strip("()"), validate=False) def __call__(self, actions: list[Action]) -> None: """Execute the provided list of actions.""" From 79f3cbca320f44b84a006d7abedb36a904ec9dc2 Mon Sep 17 00:00:00 2001 From: aoshen524 Date: Tue, 3 Feb 2026 06:34:51 +0000 Subject: [PATCH 10/11] refactor(converters): simplify return type to list[str] Remove redundant is_last tracking from converter return type. Analysis showed sandbox-platform ignores this value and recalculates based on index position. Changes: - BaseActionConverter.__call__() now returns list[str] instead of list[tuple[str, bool]] - OagiActionConverter._convert_action() simplified to just repeat commands without is_last tracking - Updated all tests to match new return type - Updated docstrings and examples This simplifies the API while maintaining full compatibility with existing consumers that already ignored the is_last value. 
Co-Authored-By: Claude Opus 4.5 --- src/oagi/converters/__init__.py | 4 ++-- src/oagi/converters/base.py | 20 +++++++------------- src/oagi/converters/oagi.py | 19 ++++++------------- tests/test_oagi_action_converter.py | 29 +++++++++++++---------------- 4 files changed, 28 insertions(+), 44 deletions(-) diff --git a/src/oagi/converters/__init__.py b/src/oagi/converters/__init__.py index ff5433e..00d8d99 100644 --- a/src/oagi/converters/__init__.py +++ b/src/oagi/converters/__init__.py @@ -18,10 +18,10 @@ converter = OagiActionConverter(config=config) # Convert OAGI actions to pyautogui strings - result = converter(actions) # list[tuple[str, bool]] + result = converter(actions) # list[str] # Convert to runtime API steps - for cmd, is_last in result: + for cmd in result: step = converter.action_string_to_step(cmd) # Execute step via runtime API... diff --git a/src/oagi/converters/base.py b/src/oagi/converters/base.py index fb7a9f1..be77942 100644 --- a/src/oagi/converters/base.py +++ b/src/oagi/converters/base.py @@ -57,7 +57,7 @@ class BaseActionConverter(ABC, Generic[T]): Provides common functionality: - Coordinate scaling via CoordinateScaler - Key normalization via shared utils - - __call__ interface returning [(action_string, is_last), ...] + - __call__ interface returning list of action strings - action_string_to_step() for runtime API format """ @@ -184,30 +184,26 @@ def _log_debug(self, message: str) -> None: if self.logger: self.logger.debug(message) - def __call__(self, actions: list[T]) -> list[tuple[str, bool]]: - """Convert actions to list of (action_string, is_last_of_repeat) tuples. + def __call__(self, actions: list[T]) -> list[str]: + """Convert actions to list of pyautogui command strings. Args: actions: List of model-specific action objects Returns: - List of tuples: [(action_string, is_last), ...] 
- - action_string: pyautogui command string - - is_last: True if this is the last action in the batch + List of pyautogui command strings Raises: RuntimeError: If all action conversions failed """ - converted: list[tuple[str, bool]] = [] + converted: list[str] = [] failed: list[tuple[str, str]] = [] skipped: list[str] = [] if not actions: return converted - for i, action in enumerate(actions): - is_last_action = i == len(actions) - 1 - + for action in actions: try: action_strings = self._convert_single_action(action) @@ -217,9 +213,7 @@ def __call__(self, actions: list[T]) -> list[tuple[str, bool]]: skipped.append(str(action_type)) continue - for j, action_str in enumerate(action_strings): - is_last = is_last_action and (j == len(action_strings) - 1) - converted.append((action_str, is_last)) + converted.extend(action_strings) except Exception as e: action_repr = repr(action) diff --git a/src/oagi/converters/oagi.py b/src/oagi/converters/oagi.py index 507f273..de428a0 100644 --- a/src/oagi/converters/oagi.py +++ b/src/oagi/converters/oagi.py @@ -44,12 +44,12 @@ def coord_width(self) -> int: def coord_height(self) -> int: return OAGI_COORD_SIZE - def __call__(self, actions: list[Action]) -> list[tuple[str, bool]]: - """Convert OAGI actions to list of (action_string, is_last) tuples. + def __call__(self, actions: list[Action]) -> list[str]: + """Convert OAGI actions to list of pyautogui command strings. Extends base implementation to handle action count and finish detection. """ - converted: list[tuple[str, bool]] = [] + converted: list[str] = [] failed: list[tuple[str, str]] = [] has_finish = False @@ -80,23 +80,16 @@ def __call__(self, actions: list[Action]) -> list[tuple[str, bool]]: ) return converted - def _convert_action(self, action: Action) -> list[tuple[str, bool]]: - """Convert action to list of (action_string, is_last_of_repeat) tuples. + def _convert_action(self, action: Action) -> list[str]: + """Convert action to list of pyautogui command strings. 
Handles action.count for repeat support. """ count = action.count or 1 - out: list[tuple[str, bool]] = [] single_actions = self._convert_single_action(action) # Repeat the actions count times - for i in range(int(count)): - is_last_repeat = i == int(count) - 1 - for j, action_str in enumerate(single_actions): - is_last = is_last_repeat and (j == len(single_actions) - 1) - out.append((action_str, is_last)) - - return out + return single_actions * int(count) def _convert_single_action(self, action: Action) -> list[str]: """Convert a single OAGI action to pyautogui command string(s).""" diff --git a/tests/test_oagi_action_converter.py b/tests/test_oagi_action_converter.py index 0969c57..2898869 100644 --- a/tests/test_oagi_action_converter.py +++ b/tests/test_oagi_action_converter.py @@ -40,8 +40,7 @@ def test_click_actions(self, converter, action_type, argument, expected_cmd): action = Action(type=action_type, argument=argument, count=1) result = converter([action]) assert len(result) == 1 - assert result[0][0] == expected_cmd - assert result[0][1] is True # is_last + assert result[0] == expected_cmd class TestDragAction: @@ -49,10 +48,10 @@ def test_drag_generates_two_commands(self, converter, config): action = Action(type=ActionType.DRAG, argument="100, 100, 500, 300", count=1) result = converter([action]) assert len(result) == 2 - assert "pyautogui.moveTo(192, 108)" in result[0][0] + assert "pyautogui.moveTo(192, 108)" in result[0] assert ( f"pyautogui.dragTo(960, 324, duration={config.drag_duration})" - in result[1][0] + in result[1] ) @@ -63,7 +62,7 @@ def test_hotkey_conversion(self, converter, config): assert len(result) == 1 assert ( f"pyautogui.hotkey('ctrl', 'c', interval={config.hotkey_interval})" - in result[0][0] + in result[0] ) @@ -72,8 +71,8 @@ def test_type_conversion(self, converter): action = Action(type=ActionType.TYPE, argument="Hello World", count=1) result = converter([action]) assert len(result) == 1 - assert "pyautogui.typewrite" in 
result[0][0] - assert "Hello World" in result[0][0] + assert "pyautogui.typewrite" in result[0] + assert "Hello World" in result[0] class TestScrollAction: @@ -84,20 +83,20 @@ def test_scroll_conversion(self, converter, direction, expected_amount): ) result = converter([action]) assert len(result) == 2 - assert "pyautogui.moveTo(960, 324)" in result[0][0] - assert f"pyautogui.scroll({expected_amount})" in result[1][0] + assert "pyautogui.moveTo(960, 324)" in result[0] + assert f"pyautogui.scroll({expected_amount})" in result[1] class TestSpecialActions: def test_wait_action(self, converter, config): action = Action(type=ActionType.WAIT, argument="", count=1) result = converter([action]) - assert f"WAIT({config.wait_duration})" in result[0][0] + assert f"WAIT({config.wait_duration})" in result[0] def test_finish_action(self, converter): action = Action(type=ActionType.FINISH, argument="", count=1) result = converter([action]) - assert result[0][0] == "DONE" + assert result[0] == "DONE" class TestActionStringToStep: @@ -123,10 +122,8 @@ def test_action_count(self, converter): result = converter([action]) # Each click generates 1 command, repeated 3 times assert len(result) == 3 - # Only the last one should have is_last=True - assert result[0][1] is False - assert result[1][1] is False - assert result[2][1] is True + # All should be the same click command + assert all(cmd == "pyautogui.click(x=960, y=324)" for cmd in result) class TestStrictCoordinateValidation: @@ -158,7 +155,7 @@ def test_strict_mode_rejects_out_of_range( def test_non_strict_mode_clamps_out_of_range(self, converter): action = Action(type=ActionType.CLICK, argument="1050, 1050", count=1) result = converter([action]) - assert "pyautogui.click(x=1919, y=1079)" in result[0][0] + assert "pyautogui.click(x=1919, y=1079)" in result[0] @pytest.mark.parametrize( "action_type,argument", From abd7a2bc1c828a3ded5f306b557e46f7c5b7e7b7 Mon Sep 17 00:00:00 2001 From: Tianyi Hu Date: Tue, 3 Feb 2026 15:34:54 +0800 
Subject: [PATCH 11/11] refactor: lint issue --- tests/test_oagi_action_converter.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_oagi_action_converter.py b/tests/test_oagi_action_converter.py index 2898869..e14fb6d 100644 --- a/tests/test_oagi_action_converter.py +++ b/tests/test_oagi_action_converter.py @@ -50,8 +50,7 @@ def test_drag_generates_two_commands(self, converter, config): assert len(result) == 2 assert "pyautogui.moveTo(192, 108)" in result[0] assert ( - f"pyautogui.dragTo(960, 324, duration={config.drag_duration})" - in result[1] + f"pyautogui.dragTo(960, 324, duration={config.drag_duration})" in result[1] )