add more

nicoloboschi · nicoloboschi · commit 75b6711366a6 · 2025-11-18T14:52:35.000+01:00
diff --git a/README.md b/README.md
@@ -11,6 +11,7 @@ Documentation: [docs.vectorize.io](https://docs.vectorize.io/build-deploy/extrac
 
 Traditional OCR tools struggle with complex layouts, poor scans, and structured data. **Iris uses advanced AI** to understand document structure and context, delivering:
 
+- 📄 **Universal format support** - Works with all unstructured document types (PDFs, images, scans, and more)
 - ✨ **High accuracy** - Handles poor quality scans and complex layouts
 - 📊 **Structure preservation** - Maintains tables, lists, and formatting
 - 🎯 **Smart chunking** - Semantic splitting for RAG pipelines
@@ -87,12 +88,17 @@ Split documents into semantic chunks perfect for RAG pipelines:
 - Preserves context across chunks
 
 ### Metadata Extraction
-Extract structured data using natural language:
+Extract structured data using JSON schemas (OpenAPI spec format recommended):
 ```python
 result = extract_text_from_file('invoice.pdf', options=ExtractionOptions(
     metadata_schemas=[{
         'id': 'invoice-data',
-        'schema': 'Extract: invoice_number, date, total_amount, vendor_name'
+        'schema': {
+            'invoice_number': 'string',
+            'date': 'string',
+            'total_amount': 'number',
+            'vendor_name': 'string'
+        }
     }]
 ))
 # Returns structured JSON metadata
@@ -143,23 +149,7 @@ Tables, lists, and other elements are properly extracted.
 Download and extract files directly from HTTP/HTTPS URLs:
 
 ```bash
-vectorize-iris https://example.com/document.pdf
-```
-
-**Output:**
-```
-🚀 Downloading file from URL
-──────────────────────────────────────────────────
-
-✓ Downloaded 2.1 MB to temporary file
-
-✨ Vectorize Iris Extraction
-──────────────────────────────────────────────────
-
-✓ Upload prepared
-✓ File uploaded successfully
-✓ Extraction started
-✓ Extraction completed in 8s
+vectorize-iris https://arxiv.org/pdf/2206.01062
 ```
 
 ### JSON Output (for piping)
@@ -272,6 +262,31 @@ Splits documents at semantic boundaries, perfect for RAG pipelines.
 vectorize-iris report.pdf --parsing-instructions "Extract only tables and numerical data, ignore narrative text"
 ```
 
+### Document Classification
+
+Pass multiple metadata schemas and Iris will automatically classify the document by selecting the schema that matches it best:
+
+```bash
+vectorize-iris invoice.pdf \
+  --metadata-schema 'invoice:{"invoice_number":"string","date":"string","total_amount":"number","vendor":"string"}' \
+  --metadata-schema 'receipt:{"store_name":"string","date":"string","items":"array","total":"number"}' \
+  --metadata-schema 'contract:{"parties":"array","effective_date":"string","terms":"string"}' \
+  --metadata-schema 'cv:{"name":"string","contact_info":"object","skills":"array","experience":"array"}' \
+  -o json
+```
+
+**Output:**
+```json
+{
+  "success": true,
+  "text": "...",
+  "metadata": "{\"invoice_number\":\"INV-2024-001\",\"date\":\"2024-01-15\",\"total_amount\":1250.00,\"vendor\":\"Acme Corp\"}",
+  "metadataSchema": "invoice"
+}
+```
+
+Iris automatically detected this was an invoice and extracted the relevant fields using the matching schema.
+
 ### Advanced Options
 
 ```bash
diff --git a/examples/classification.sh b/examples/classification.sh
@@ -0,0 +1,73 @@
+#!/bin/bash
+
+# Document Classification Example
+#
+# Prints a walkthrough showing how to pass several --metadata-schema options
+# to vectorize-iris so that it classifies a document and extracts the fields
+# of the best-matching schema. Nothing below is executed: the commands and the
+# expected output are only displayed as text.
+# The quoted heredoc delimiter ('EOF') keeps the backslashes and quotes in the
+# sample commands and JSON literal, so they are shown exactly as written.
+
+cat <<'EOF'
+===================================================================
+Document Classification Example
+===================================================================
+
+When you pass multiple metadata schemas, Iris will automatically
+determine which schema best matches your document and extract
+fields accordingly.
+
+Example 1: Classifying a single document
+-----------------------------------------------------------------
+
+Command:
+  vectorize-iris document.pdf \
+    --metadata-schema 'invoice:{"invoice_number":"string","date":"string","total_amount":"number","vendor":"string"}' \
+    --metadata-schema 'receipt:{"store_name":"string","date":"string","items":"array","total":"number"}' \
+    --metadata-schema 'contract:{"parties":"array","effective_date":"string","terms":"string"}' \
+    -o json
+
+Expected output:
+{
+  "success": true,
+  "text": "...",
+  "metadata": "{\"invoice_number\":\"INV-2024-001\",\"date\":\"2024-01-15\",\"total_amount\":1250.00,\"vendor\":\"Acme Corp\"}",
+  "metadataSchema": "invoice"
+}
+
+Note: The 'metadataSchema' field tells you which schema matched best.
+
+Example 2: Batch classification of multiple documents
+-----------------------------------------------------------------
+
+You can classify multiple documents at once:
+
+Command:
+  vectorize-iris ./documents \
+    --metadata-schema 'invoice:{"invoice_number":"string","date":"string","total_amount":"number","vendor":"string"}' \
+    --metadata-schema 'receipt:{"store_name":"string","date":"string","items":"array","total":"number"}' \
+    --metadata-schema 'contract:{"parties":"array","effective_date":"string","terms":"string"}' \
+    -o json -f ./output
+
+This will process all documents in the ./documents directory,
+classify each one, and save the results to ./output with the
+appropriate schema detected for each document.
+
+Example 3: Using jq to filter classified documents
+-----------------------------------------------------------------
+
+You can pipe the JSON output to jq to filter by document type:
+
+Command:
+  vectorize-iris document.pdf \
+    --metadata-schema 'invoice:{"invoice_number":"string","date":"string","total_amount":"number","vendor":"string"}' \
+    --metadata-schema 'receipt:{"store_name":"string","date":"string","items":"array","total":"number"}' \
+    -o json | jq 'select(.metadataSchema == "invoice")'
+
+This extracts only documents that were classified as invoices.
+
+===================================================================
+Try it yourself!
+===================================================================
+EOF
diff --git a/nodejs-api/README.md b/nodejs-api/README.md
@@ -301,13 +301,17 @@ import type {
   MetadataExtractionStrategySchema
 } from '@vectorize-io/iris';
 
-// Type-safe options
+// Type-safe options with structured schema (OpenAPI spec format)
 const options: ExtractionOptions = {
   chunkSize: 512,
   parsingInstructions: 'Extract code blocks',
   metadataSchemas: [{
     id: 'doc-meta',
-    schema: 'Extract: title, author, date'
+    schema: {
+      title: 'string',
+      author: 'string',
+      date: 'string'
+    }
   }],
   pollInterval: 2000,
   timeout: 300000
diff --git a/nodejs-api/examples/classification.ts b/nodejs-api/examples/classification.ts
@@ -0,0 +1,163 @@
+/**
+ * Document Classification Example
+ *
+ * This example demonstrates how to use multiple metadata schemas to automatically
+ * classify documents and extract relevant fields.
+ *
+ * When you provide multiple metadata schemas, Iris will:
+ * 1. Analyze the document
+ * 2. Determine which schema best matches the document type
+ * 3. Extract fields according to the matching schema
+ * 4. Return the schema ID in the response
+ */
+
+import { extractTextFromFile, ExtractionOptions } from '@vectorize-io/iris';
+import * as fs from 'fs/promises';
+import * as path from 'path';
+
+// Options shared by Examples 1 and 2: the three candidate schemas (invoice,
+// receipt, contract). They are declared once so the batch loop does not
+// rebuild the same literal for every file.
+// Annotating the object as ExtractionOptions lets the compiler check the
+// entries against the library's option type.
+const classificationOptions: ExtractionOptions = {
+    metadataSchemas: [
+        {
+            id: 'invoice',
+            schema: {
+                invoice_number: 'string',
+                date: 'string',
+                total_amount: 'number',
+                vendor_name: 'string'
+            }
+        },
+        {
+            id: 'receipt',
+            schema: {
+                store_name: 'string',
+                date: 'string',
+                items: 'array',
+                total: 'number'
+            }
+        },
+        {
+            id: 'contract',
+            schema: {
+                parties: 'array',
+                effective_date: 'string',
+                terms: 'string'
+            }
+        }
+    ]
+};
+
+// Example 1: Single document classification
+console.log('='.repeat(70));
+console.log('Example 1: Classifying a single document');
+console.log('='.repeat(70));
+console.log();
+
+// Examples 1-3 run one after another inside a single async function. The
+// promise it returns gets a rejection handler at the end of this file, so a
+// failed extraction is reported instead of becoming an unhandled rejection.
+(async () => {
+    // Send the document together with all three candidate schemas
+    const result = await extractTextFromFile('document.pdf', classificationOptions);
+
+    // Check which schema matched
+    console.log(`Document classified as: ${result.metadataSchema}`);
+    console.log(`Extracted metadata: ${result.metadata}`);
+    console.log();
+
+    // Example 2: Processing multiple documents with classification
+    console.log('='.repeat(70));
+    console.log('Example 2: Batch classification of multiple documents');
+    console.log('='.repeat(70));
+    console.log();
+
+    const documentsDir = './documents';
+    // Stays empty when the directory is missing, which turns the loop below
+    // into a no-op.
+    let files: string[] = [];
+    try {
+        files = await fs.readdir(documentsDir);
+    } catch (error) {
+        // Only a missing directory is an expected, skippable condition. Any
+        // other failure is rethrown so it is not misreported as "not found".
+        if ((error as NodeJS.ErrnoException).code !== 'ENOENT') {
+            throw error;
+        }
+        console.log('Documents directory not found, skipping batch example');
+    }
+
+    // Files are processed sequentially, one extraction at a time
+    for (const file of files) {
+        const filePath = path.join(documentsDir, file);
+        const stat = await fs.stat(filePath);
+
+        // Skip sub-directories and other non-file entries
+        if (!stat.isFile()) {
+            continue;
+        }
+
+        const batchResult = await extractTextFromFile(filePath, classificationOptions);
+
+        console.log(`File: ${file}`);
+        console.log(`  Type: ${batchResult.metadataSchema}`);
+        console.log(`  Metadata: ${batchResult.metadata}`);
+        console.log();
+    }
+
+    // Example 3: Conditional processing based on classification
+    console.log('='.repeat(70));
+    console.log('Example 3: Conditional processing based on document type');
+    console.log('='.repeat(70));
+    console.log();
+
+    // Only the invoice and receipt schemas are offered in this call
+    const classifiedResult = await extractTextFromFile('document.pdf', {
+        metadataSchemas: [
+            {
+                id: 'invoice',
+                schema: {
+                    invoice_number: 'string',
+                    date: 'string',
+                    total_amount: 'number',
+                    vendor_name: 'string'
+                }
+            },
+            {
+                id: 'receipt',
+                schema: {
+                    store_name: 'string',
+                    date: 'string',
+                    items: 'array',
+                    total: 'number'
+                }
+            }
+        ]
+    });
+
+    // Process differently based on document type
+    switch (classifiedResult.metadataSchema) {
+        case 'invoice':
+            console.log('Processing as invoice...');
+            // Invoice-specific logic here
+            console.log(`Invoice data: ${classifiedResult.metadata}`);
+            break;
+        case 'receipt':
+            console.log('Processing as receipt...');
+            // Receipt-specific logic here
+            console.log(`Receipt data: ${classifiedResult.metadata}`);
+            break;
+        default:
+            // Anything else: show a preview of the extracted text
+            console.log('Unknown document type');
+            console.log(`Extracted text: ${classifiedResult.text.substring(0, 200)}...`);
+    }
+})().catch((error) => {
+    // Report the real cause. Setting exitCode (instead of calling
+    // process.exit) signals failure while letting pending output flush.
+    console.error('Classification example failed:', error);
+    process.exitCode = 1;
+});
diff --git a/python-api/examples/classification.py b/python-api/examples/classification.py
diff --git a/rust-cli/src/main.rs b/rust-cli/src/main.rs