42 changes: 42 additions & 0 deletions .github/workflows/tests.yml
@@ -0,0 +1,42 @@
name: Tests

on:
push:
branches: [ main ]
pull_request:
branches: [ main ]

jobs:
test:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.12"]

steps:
- uses: actions/checkout@v4

- name: Install uv
uses: astral-sh/setup-uv@v3
with:
version: "latest"

- name: Set up Python ${{ matrix.python-version }}
run: uv python install ${{ matrix.python-version }}

- name: Install dependencies
run: uv sync --all-extras

- name: Run all tests
run: |
echo "Running unit tests..."
uv run pytest tests/unit/ -v --tb=short
echo "Running integration tests (mocked - no real API calls)..."
uv run pytest tests/integration/ -v --tb=short

- name: Upload coverage reports
uses: codecov/codecov-action@v4
if: matrix.python-version == '3.12'
with:
file: ./coverage.xml
fail_ci_if_error: false
1 change: 1 addition & 0 deletions .gitignore
@@ -174,6 +174,7 @@ PLANNING.md
.pdm-build/
.ruff_cache/
.vscode/
.kiro

# Copilot
.github/instructions/
42 changes: 42 additions & 0 deletions README.md
@@ -113,6 +113,48 @@ rules:
2. **Try acknowledgment workflow**: Comment `@watchflow acknowledge` when rules are violated
3. **Verify rule enforcement**: Check that blocking rules prevent merging

## πŸ§ͺ Testing

The project includes comprehensive tests that run **without making real API calls** by default:

### Running Tests

```bash
# Run all tests (mocked - no API costs)
pytest

# Run only unit tests (very fast)
pytest tests/unit/

# Run only integration tests (mocked)
pytest tests/integration/
```

### Test Structure

```
tests/
β”œβ”€β”€ unit/                     # ⚑ Fast unit tests (mocked OpenAI)
β”‚   └── test_feasibility_agent.py
└── integration/              # 🌐 Full HTTP stack tests (mocked OpenAI)
    └── test_rules_api.py
```
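
For orientation, here is a minimal sketch of the mocked pattern such a unit test can follow. The constructor call, patch target, and field values are illustrative assumptions, not the project's actual test code:

```python
# Hypothetical sketch of a mocked unit test; assumes RuleFeasibilityAgent()
# can be constructed without arguments and exposes a compiled `graph`.
from unittest.mock import AsyncMock, patch

import pytest

from src.agents.feasibility_agent.agent import RuleFeasibilityAgent
from src.agents.feasibility_agent.models import FeasibilityState


@pytest.mark.asyncio
async def test_execute_without_real_api():
    agent = RuleFeasibilityAgent()
    fake_state = FeasibilityState(
        rule_description="Require two approvals before merge",
        is_feasible=True,
        rule_type="approval_requirement",
        confidence_score=0.9,
        feedback="Feasible with the approvals validator",
    )
    # Patch the compiled graph so no OpenAI call is ever made.
    with patch.object(agent, "graph") as fake_graph:
        fake_graph.ainvoke = AsyncMock(return_value=fake_state)
        result = await agent.execute("Require two approvals before merge")

    assert result.success is True
    assert result.data["rule_type"] == "approval_requirement"
```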

### Real API Testing (Local Development Only)

If you want to test with **real OpenAI API calls** locally:

```bash
# Set environment variables
export OPENAI_API_KEY="your-api-key"
export INTEGRATION_TEST_REAL_API=true

# Run integration tests with real API calls (costs money!)
pytest tests/integration/ -m integration
```

**⚠️ Warning:** Real API tests make actual OpenAI calls and will cost money. They're disabled by default in CI/CD.
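
The default-off behaviour can be enforced with a small guard. The sketch below assumes a `conftest.py` with a `skipif` marker keyed on the same environment variable; it is an illustration rather than the project's actual wiring:

```python
# conftest.py (hypothetical sketch): gate real-API tests behind an env var.
import os

import pytest

REAL_API_ENABLED = os.getenv("INTEGRATION_TEST_REAL_API", "").lower() == "true"

# Apply to any test that must hit the real OpenAI API; skipped otherwise.
requires_real_api = pytest.mark.skipif(
    not REAL_API_ENABLED,
    reason="Set INTEGRATION_TEST_REAL_API=true to run real OpenAI tests.",
)
```

In CI the variable is unset, so any test carrying such a marker is skipped automatically.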

## Configuration

For advanced configuration options, see the [Configuration Guide](docs/getting-started/configuration.md).
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -200,13 +200,14 @@ python_functions = ["test_*"]
addopts = [
"--strict-markers",
"--strict-config",
"--cov=backend",
"--cov=src",
"--cov-report=term-missing",
"--cov-report=html",
"--cov-report=xml",
]
asyncio_mode = "auto"


[tool.coverage.run]
source = ["backend"]
omit = [
3 changes: 1 addition & 2 deletions src/agents/feasibility_agent/__init__.py
@@ -6,6 +6,5 @@
"""

from .agent import RuleFeasibilityAgent
from .models import FeasibilityResult

__all__ = ["RuleFeasibilityAgent", "FeasibilityResult"]
__all__ = ["RuleFeasibilityAgent"]
52 changes: 27 additions & 25 deletions src/agents/feasibility_agent/agent.py
@@ -8,7 +8,7 @@

from src.agents.base import AgentResult, BaseAgent

from .models import FeasibilityResult, FeasibilityState
from .models import FeasibilityState
from .nodes import analyze_rule_feasibility, generate_yaml_config

logger = logging.getLogger(__name__)
@@ -27,51 +27,53 @@ def _build_graph(self) -> StateGraph:
workflow.add_node("analyze_feasibility", analyze_rule_feasibility)
workflow.add_node("generate_yaml", generate_yaml_config)

# Add edges
# Add edges with conditional logic
workflow.add_edge(START, "analyze_feasibility")
workflow.add_edge("analyze_feasibility", "generate_yaml")

# Conditional edge: only generate YAML if feasible
workflow.add_conditional_edges(
"analyze_feasibility",
lambda state: "generate_yaml" if state.is_feasible else END,
{"generate_yaml": "generate_yaml", END: END},
)

workflow.add_edge("generate_yaml", END)

logger.info("πŸ”§ FeasibilityAgent graph built with conditional structured output workflow")
return workflow.compile()

async def execute(self, rule_description: str) -> AgentResult:
"""
Check if a rule description is feasible and return YAML or feedback.
"""
try:
logger.info(f"πŸš€ Starting feasibility analysis for rule: {rule_description[:100]}...")

# Prepare initial state
initial_state = FeasibilityState(rule_description=rule_description)

# Run the graph
result = await self.graph.ainvoke(initial_state)

# Convert dict result back to FeasibilityState if needed
if isinstance(result, dict):
result = FeasibilityState(**result)

logger.info(f"βœ… Feasibility analysis completed: feasible={result.is_feasible}, type={result.rule_type}")

# Convert to AgentResult
return AgentResult(
success=result.get("is_feasible", False),
message=result.get("feedback", ""),
success=result.is_feasible,
message=result.feedback,
data={
"is_feasible": result.get("is_feasible", False),
"yaml_content": result.get("yaml_content", ""),
"confidence_score": result.get("confidence_score", 0.0),
"rule_type": result.get("rule_type", ""),
"analysis_steps": result.get("analysis_steps", []),
"is_feasible": result.is_feasible,
"yaml_content": result.yaml_content,
"confidence_score": result.confidence_score,
"rule_type": result.rule_type,
"analysis_steps": result.analysis_steps,
},
)

except Exception as e:
logger.error(f"Error in rule feasibility check: {e}")
logger.error(f"❌ Error in rule feasibility check: {e}")
return AgentResult(success=False, message=f"Feasibility check failed: {str(e)}", data={})

async def check_feasibility(self, rule_description: str) -> FeasibilityResult:
"""
Legacy method for backwards compatibility.
"""
result = await self.execute(rule_description)

return FeasibilityResult(
is_feasible=result.data.get("is_feasible", False),
yaml_content=result.data.get("yaml_content", ""),
feedback=result.message,
confidence_score=result.data.get("confidence_score"),
rule_type=result.data.get("rule_type"),
)
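
As a quick reference for reviewers, a minimal usage sketch of the reworked `execute` method follows; the rule text is illustrative and a configured OpenAI key is assumed at runtime:

```python
# Hypothetical quick check of the agent; requires OPENAI_API_KEY to be set
# via the centralized config before running.
import asyncio

from src.agents.feasibility_agent.agent import RuleFeasibilityAgent


async def main() -> None:
    agent = RuleFeasibilityAgent()
    result = await agent.execute("Require at least two approvals on release branches")
    print("feasible:", result.data["is_feasible"])
    print("rule type:", result.data["rule_type"])
    print(result.data["yaml_content"])


if __name__ == "__main__":
    asyncio.run(main())
```
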
22 changes: 14 additions & 8 deletions src/agents/feasibility_agent/models.py
@@ -5,14 +5,20 @@
from pydantic import BaseModel, Field


class FeasibilityResult(BaseModel):
"""Result of checking if a rule is feasible."""

is_feasible: bool
yaml_content: str
feedback: str
confidence_score: float | None = None
rule_type: str | None = None
class FeasibilityAnalysis(BaseModel):
"""Structured output model for rule feasibility analysis."""

is_feasible: bool = Field(description="Whether the rule is feasible to implement with Watchflow")
rule_type: str = Field(description="Type of rule (time_restriction, branch_pattern, title_pattern, etc.)")
confidence_score: float = Field(description="Confidence score from 0.0 to 1.0", ge=0.0, le=1.0)
feedback: str = Field(description="Detailed feedback on implementation considerations")
analysis_steps: list[str] = Field(description="Step-by-step analysis breakdown", default_factory=list)


class YamlGeneration(BaseModel):
"""Structured output model for YAML configuration generation."""

yaml_content: str = Field(description="Generated Watchflow YAML rule configuration")


class FeasibilityState(BaseModel):
100 changes: 38 additions & 62 deletions src/agents/feasibility_agent/nodes.py
@@ -2,118 +2,94 @@
LangGraph nodes for the Rule Feasibility Agent.
"""

import json
import logging

from langchain_openai import ChatOpenAI

from src.core.config import config

from .models import FeasibilityState
from .models import FeasibilityAnalysis, FeasibilityState, YamlGeneration
from .prompts import RULE_FEASIBILITY_PROMPT, YAML_GENERATION_PROMPT

logger = logging.getLogger(__name__)


def analyze_rule_feasibility(state: FeasibilityState) -> FeasibilityState:
async def analyze_rule_feasibility(state: FeasibilityState) -> FeasibilityState:
"""
Analyze whether a rule description is feasible to implement.
Analyze whether a rule description is feasible to implement using structured output.
"""
try:
# Create LLM client directly using centralized config
# Create LLM client with structured output
llm = ChatOpenAI(
api_key=config.ai.api_key,
model=config.ai.model,
max_tokens=config.ai.max_tokens,
temperature=config.ai.temperature,
)

# Use structured output instead of manual JSON parsing
structured_llm = llm.with_structured_output(FeasibilityAnalysis)

# Analyze rule feasibility
prompt = RULE_FEASIBILITY_PROMPT.format(rule_description=state.rule_description)

response = llm.invoke(prompt)

# Log the raw response for debugging
logger.info(f"Raw LLM response: {response.content}")

# Check if response is empty
if not response.content or response.content.strip() == "":
logger.error("LLM returned empty response")
state.is_feasible = False
state.feedback = "Analysis failed: LLM returned empty response"
return state

# Try to parse JSON with better error handling
try:
result = json.loads(response.content.strip())
except json.JSONDecodeError as json_error:
logger.error(f"Failed to parse JSON response: {json_error}")
logger.error(f"Response content: {response.content}")

# Try to extract JSON from markdown code blocks if present
content = response.content.strip()
if content.startswith("```json"):
content = content[7:] # Remove ```json
elif content.startswith("```"):
content = content[3:] # Remove ```
if content.endswith("```"):
content = content[:-3] # Remove trailing ```

try:
result = json.loads(content.strip())
logger.info("Successfully extracted JSON from markdown code blocks")
except json.JSONDecodeError:
# If all parsing attempts fail, set default values
logger.error("All JSON parsing attempts failed")
state.is_feasible = False
state.feedback = (
f"Analysis failed: Could not parse LLM response as JSON. Raw response: {response.content[:200]}..."
)
return state

# Update state with analysis results
state.is_feasible = result.get("is_feasible", False)
state.rule_type = result.get("rule_type", "")
state.confidence_score = result.get("confidence_score", 0.0)
state.yaml_content = result.get("yaml_content", "")
state.feedback = result.get("feedback", "")
state.analysis_steps = result.get("analysis_steps", [])

logger.info(f"Rule feasibility analysis completed: {state.is_feasible}")
# Get structured response - no more JSON parsing needed!
result = await structured_llm.ainvoke(prompt)

# Update state with analysis results - now type-safe!
state.is_feasible = result.is_feasible
state.rule_type = result.rule_type
state.confidence_score = result.confidence_score
state.feedback = result.feedback
state.analysis_steps = result.analysis_steps

logger.info(f"πŸ” Rule feasibility analysis completed: {state.is_feasible}")
logger.info(f"πŸ” Rule type identified: {state.rule_type}")
logger.info(f"πŸ” Confidence score: {state.confidence_score}")

except Exception as e:
logger.error(f"Error in rule feasibility analysis: {e}")
logger.error(f"❌ Error in rule feasibility analysis: {e}")
state.is_feasible = False
state.feedback = f"Analysis failed: {str(e)}"
state.confidence_score = 0.0

return state


def generate_yaml_config(state: FeasibilityState) -> FeasibilityState:
async def generate_yaml_config(state: FeasibilityState) -> FeasibilityState:
"""
Generate YAML configuration for feasible rules.
Generate YAML configuration for feasible rules using structured output.
This node only runs if the rule is feasible.
"""
if not state.is_feasible or not state.rule_type:
logger.info("πŸ”§ Skipping YAML generation - rule not feasible or no rule type")
return state

try:
# Create LLM client directly using centralized config
# Create LLM client with structured output
llm = ChatOpenAI(
api_key=config.ai.api_key,
model=config.ai.model,
max_tokens=config.ai.max_tokens,
temperature=config.ai.temperature,
)

# Use structured output for YAML generation
structured_llm = llm.with_structured_output(YamlGeneration)

prompt = YAML_GENERATION_PROMPT.format(rule_type=state.rule_type, rule_description=state.rule_description)

response = llm.invoke(prompt)
state.yaml_content = response.content.strip()
# Get structured response
result = await structured_llm.ainvoke(prompt)

# Update state with generated YAML
state.yaml_content = result.yaml_content.strip()

logger.info(f"YAML configuration generated for rule type: {state.rule_type}")
logger.info(f"πŸ”§ YAML configuration generated for rule type: {state.rule_type}")
logger.info(f"πŸ”§ Generated YAML length: {len(state.yaml_content)} characters")

except Exception as e:
logger.error(f"Error generating YAML configuration: {e}")
logger.error(f"❌ Error generating YAML configuration: {e}")
state.feedback += f"\nYAML generation failed: {str(e)}"

return state