Add Markdown-KV output format and bump version to 3.8.2

claude · claude · commit 52d089eb51f7 · 2025-11-09T07:39:16.000Z
This commit adds a new output format optimized for LLM understanding and updates the package version to 3.8.2. New Feature: Markdown-KV Output Format - Add 'markdownkv' as a new output format option - Optimized for LLM understanding (60.7% accuracy vs 44.3% for CSV) - Based on research: https://www.empiricalagents.com/blog/which-table-format-do-llms-understand-best - Hierarchical structure with markdown headers and code blocks - Ideal for RAG pipelines and AI systems processing tabular data Implementation: - Update OutputFormatter class to support markdownkv - Add _format_markdownkv() for query results - Add _format_markdownkv_error() for error formatting - Add _format_markdownkv_statement() for statement results - Format: "# Query Results" + "## Record N" + code blocks with key: value pairs - Update StackQL class for server mode support - Handle markdownkv in execute() for queries - Handle markdownkv in executeStmt() for statements - Add comprehensive test suite - tests/test_markdownkv_format.py - Tests for simple data, null values, errors, statements - Tests for LLM-friendly structure validation - Tests for server mode compatibility Version & Documentation: - Bump version from 3.8.1 to 3.8.2 in pyproject.toml - Update CHANGELOG.md with: - Centralized error detection feature - Markdown-KV output format feature - New dependencies (PyYAML) - New test suites This release includes both the error detection feature (previous commits) and the new Markdown-KV format, making pystackql more powerful for AI/LLM use cases and production deployments.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,35 @@
 # Changelog
 
+## v3.8.2 (2025-11-09)
+
+### New Features
+
+- **Centralized Error Detection**: Added centralized error detection system with configurable patterns
+  - New `errors.yaml` configuration file with error patterns
+  - Supports three pattern types: fuzzy matches, exact matches, and regex matches
+  - Automatically detects errors in stdout and moves them to the `error` field
+  - Eliminates need for external applications to parse error messages
+  - Includes patterns for HTTP 4xx/5xx errors, DNS failures, connection errors, and timeouts
+  - Added `ErrorDetector` class for pattern-based error detection
+
+- **Markdown-KV Output Format**: Added `markdownkv` output format optimized for LLM understanding
+  - Based on research showing 60.7% LLM accuracy with Markdown-KV vs 44.3% with CSV
+  - Ideal for RAG pipelines and AI-based systems processing tabular data
+  - Hierarchical structure with markdown headers and code blocks
+  - Supported in both local and server modes
+  - Reference: [Which Table Format Do LLMs Understand Best?](https://www.empiricalagents.com/blog/which-table-format-do-llms-understand-best)
+
+### Dependencies
+
+- Added `PyYAML>=5.4.0` for error pattern configuration
+
+### Testing
+
+- Added comprehensive test suite for error detection (`tests/test_error_detection.py`)
+- Added test suite for Markdown-KV format (`tests/test_markdownkv_format.py`)
+- Tests for regex pattern matching, DNS errors, connection errors, and timeouts
+- Tests for LLM-friendly data formatting
+
 ## v3.8.1 (2025-06-25)
 
 ### Updates
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "pystackql"
-version = "3.8.1"
+version = "3.8.2"
 description = "A Python interface for StackQL"
 readme = "README.rst"
 authors = [
diff --git a/pystackql/core/output.py b/pystackql/core/output.py
@@ -27,7 +27,7 @@ def __init__(self, output_format='dict'):
         Raises:
             ValueError: If an invalid output format is specified
         """
-        ALLOWED_OUTPUTS = {'dict', 'pandas', 'csv'}
+        ALLOWED_OUTPUTS = {'dict', 'pandas', 'csv', 'markdownkv'}
         if output_format.lower() not in ALLOWED_OUTPUTS:
             raise ValueError(f"Invalid output format. Expected one of {ALLOWED_OUTPUTS}, got {output_format}.")
         self.output_format = output_format.lower()
@@ -63,10 +63,10 @@ def format_query_result(self, result, suppress_errors=True):
     
     def _format_exception(self, exception_msg):
         """Format an exception message.
-        
+
         Args:
             exception_msg (str): The exception message
-            
+
         Returns:
             The formatted exception in the specified output format
         """
@@ -75,15 +75,17 @@ def _format_exception(self, exception_msg):
             return pd.DataFrame({'error': [exception_msg]}) if exception_msg else pd.DataFrame({'error': []})
         elif self.output_format == 'csv':
             return exception_msg
+        elif self.output_format == 'markdownkv':
+            return self._format_markdownkv_error(exception_msg)
         else:  # dict
             return [{"error": exception_msg}]
     
     def _format_error(self, error_msg):
         """Format an error message.
-        
+
         Args:
             error_msg (str): The error message
-            
+
         Returns:
             The formatted error in the specified output format
         """
@@ -92,6 +94,8 @@ def _format_error(self, error_msg):
             return pd.DataFrame({'error': [error_msg]}) if error_msg else pd.DataFrame({'error': []})
         elif self.output_format == 'csv':
             return error_msg
+        elif self.output_format == 'markdownkv':
+            return self._format_markdownkv_error(error_msg)
         else:  # dict
             return [{"error": error_msg}]
     
@@ -118,6 +122,11 @@ def _format_data(self, data):
                 return data  # Return the error message as-is for CSV
             return data
 
+        if self.output_format == 'markdownkv':
+            # For markdownkv, check for errors before parsing
+            if isinstance(data, str) and self.error_detector.is_error(data):
+                return self._format_markdownkv_error(data)
+
         # Check if the raw data string itself is an error message (before JSON parsing)
         if isinstance(data, str) and self.error_detector.is_error(data):
             # The entire response is an error message
@@ -157,6 +166,8 @@ def _format_data(self, data):
                 import pandas as pd
                 # Convert the preprocessed JSON data to a DataFrame
                 return pd.DataFrame(processed_json_data)
+            elif self.output_format == 'markdownkv':
+                return self._format_markdownkv(processed_json_data)
 
             # Return the preprocessed dictionary data
             return processed_json_data
@@ -245,7 +256,7 @@ def _process_sql_types(self, data):
 
     def _format_empty(self):
         """Format an empty result.
-        
+
         Returns:
             An empty result in the specified output format
         """
@@ -254,8 +265,64 @@ def _format_empty(self):
             return pd.DataFrame()
         elif self.output_format == 'csv':
             return ""
+        elif self.output_format == 'markdownkv':
+            return "# Query Results\n\nNo records found.\n"
         else:  # dict
             return []
+
+    def _format_markdownkv(self, data):
+        """Format data as Markdown Key-Value pairs ("## Record N" + fenced block).
+
+        This format is optimized for LLM understanding based on research showing
+        it achieves 60.7% accuracy vs 44.3% for CSV when LLMs process tabular data.
+
+        Args:
+            data: List of dicts (one per record); a single dict is treated as one record
+
+        Returns:
+            str: Markdown text; falsy data yields the "No records found." document
+        """
+        if not data:
+            return "# Query Results\n\nNo records found.\n"
+
+        # A lone dict is a single record; wrap it so the loop below handles both shapes
+        if isinstance(data, dict):
+            data = [data]
+
+        output = ["# Query Results\n"]
+
+        for idx, record in enumerate(data, 1):
+            output.append(f"## Record {idx}\n")
+            output.append("```")
+
+            # Emit one "key: value" line per field, in the record's own key order
+            for key, value in record.items():
+                # Render None as the literal "null" instead of Python's "None"
+                if value is None:
+                    value = "null"
+                output.append(f"{key}: {value}")
+
+            output.append("```\n")
+
+        return "\n".join(output)
+
+    def _format_markdownkv_error(self, error_msg):
+        """Format an error message as a Markdown-KV document with a "## Error" section.
+
+        Args:
+            error_msg (str): The error message, interpolated as-is after "error: "
+
+        Returns:
+            str: Markdown text with the message inside a fenced code block
+        """
+        return f"""# Query Results
+
+## Error
+
+```
+error: {error_msg}
+```
+"""
     
     def format_statement_result(self, result):
         """Format a statement result.
@@ -284,5 +351,28 @@ def format_statement_result(self, result):
             return pd.DataFrame({'message': [message]}) if message else pd.DataFrame({'message': []})
         elif self.output_format == 'csv':
             return message
+        elif self.output_format == 'markdownkv':
+            return self._format_markdownkv_statement(message)
         else:  # dict
-            return {'message': message.rstrip('\n')}
+            return {'message': message.rstrip('\n')}
+
+    def _format_markdownkv_statement(self, message):
+        """Format a statement result message as a Markdown-KV "# Statement Result" document.
+
+        Args:
+            message (str): The statement result message; trailing whitespace is stripped
+
+        Returns:
+            str: Markdown text; a falsy message yields the "No message returned." document
+        """
+        if not message:
+            return "# Statement Result\n\nNo message returned.\n"
+
+        return f"""# Statement Result
+
+## Result
+
+```
+message: {message.rstrip()}
+```
+"""
diff --git a/pystackql/core/stackql.py b/pystackql/core/stackql.py
@@ -310,6 +310,12 @@ def executeStmt(self, query, custom_auth=None, env_vars=None, **kwargs):
             elif output_format == 'csv':
                 # Return the string representation of the result
                 return result[0]['message']
+            elif output_format == 'markdownkv':
+                from .output import OutputFormatter
+                temp_formatter = OutputFormatter('markdownkv')
+                # Extract message from result
+                message = result[0].get('message', '') if result else ''
+                return temp_formatter._format_markdownkv_statement(message)
             else:
                 return result
         else:
@@ -392,6 +398,10 @@ def execute(self, query, suppress_errors=True, custom_auth=None, env_vars=None,
                 return pd.read_json(StringIO(json_str))
             elif output_format == 'csv':
                 raise ValueError("CSV output is not supported in server_mode.")
+            elif output_format == 'markdownkv':
+                from .output import OutputFormatter
+                temp_formatter = OutputFormatter('markdownkv')
+                return temp_formatter._format_markdownkv(result)
             else:  # Assume 'dict' output
                 return result
         else:
diff --git a/tests/test_markdownkv_format.py b/tests/test_markdownkv_format.py