diff --git a/src/openparse/processing/__init__.py b/src/openparse/processing/__init__.py
index aabb0fe..5eff375 100644
--- a/src/openparse/processing/__init__.py
+++ b/src/openparse/processing/__init__.py
@@ -2,6 +2,7 @@
     IngestionPipeline,
     BasicIngestionPipeline,
     SemanticIngestionPipeline,
+    LocalSemanticIngestionPipeline,
     NoOpIngestionPipeline,
 )
 from .basic_transforms import (
@@ -15,7 +16,7 @@
     CombineNodesSpatially,
     RemoveNodesBelowNTokens,
 )
-from .semantic_transforms import CombineNodesSemantically, OpenAIEmbeddings
+from .semantic_transforms import CombineNodesSemantically, OpenAIEmbeddings, OllamaEmbeddings
 
 __all__ = [
     "ProcessingStep",
@@ -29,8 +30,10 @@
     "BasicIngestionPipeline",
     "IngestionPipeline",
     "SemanticIngestionPipeline",
+    "LocalSemanticIngestionPipeline",
     "NoOpIngestionPipeline",
     "RemoveNodesBelowNTokens",
     "CombineNodesSemantically",
     "OpenAIEmbeddings",
+    "OllamaEmbeddings",
 ]
diff --git a/src/openparse/processing/ingest.py b/src/openparse/processing/ingest.py
index cef9414..0c58edb 100644
--- a/src/openparse/processing/ingest.py
+++ b/src/openparse/processing/ingest.py
@@ -17,6 +17,7 @@
 from openparse.processing.semantic_transforms import (
     CombineNodesSemantically,
     OpenAIEmbeddings,
+    OllamaEmbeddings,
     EmbeddingModel,
 )
 
@@ -131,3 +132,45 @@
             ),
             RemoveNodesBelowNTokens(min_tokens=min_tokens),
         ]
+
+class LocalSemanticIngestionPipeline(IngestionPipeline):
+    """
+    A semantic pipeline for ingesting and processing Nodes using Ollama embeddings.
+    """
+
+    def __init__(
+        self,
+        url: str = "http://localhost:11434",
+        model: str = "mxbai-embed-large",
+        min_tokens: int = consts.TOKENIZATION_LOWER_LIMIT,
+        max_tokens: int = consts.TOKENIZATION_UPPER_LIMIT,
+    ) -> None:
+        embedding_client = OllamaEmbeddings(url=url, model=model)
+
+        self.transformations = [
+            RemoveTextInsideTables(),
+            RemoveFullPageStubs(max_area_pct=0.35),
+            # mostly aimed at combining bullets and weird formatting
+            CombineNodesSpatially(
+                x_error_margin=10,
+                y_error_margin=2,
+                criteria="both_small",
+            ),
+            CombineHeadingsWithClosestText(),
+            CombineBullets(),
+            RemoveMetadataElements(),
+            RemoveRepeatedElements(threshold=2),
+            RemoveNodesBelowNTokens(min_tokens=10),
+            CombineBullets(),
+            CombineNodesSemantically(
+                embedding_client=embedding_client,
+                min_similarity=0.6,
+                max_tokens=max_tokens // 2,
+            ),
+            CombineNodesSemantically(
+                embedding_client=embedding_client,
+                min_similarity=0.55,
+                max_tokens=max_tokens,
+            ),
+            RemoveNodesBelowNTokens(min_tokens=min_tokens),
+        ]
diff --git a/src/openparse/processing/semantic_transforms.py b/src/openparse/processing/semantic_transforms.py
index d01d085..a2a2fb2 100644
--- a/src/openparse/processing/semantic_transforms.py
+++ b/src/openparse/processing/semantic_transforms.py
@@ -1,4 +1,8 @@
+import json
+
 from typing import List, Literal, Dict, Union
+from urllib.parse import urlparse
+from http.client import HTTPConnection, HTTPSConnection
 
 import numpy as np
 
@@ -68,6 +72,68 @@
         return OpenAI(api_key=self.api_key)
 
 
+class OllamaEmbeddings:
+    """
+    Calculate embeddings with local models served by Ollama, via its REST API
+    (https://github.com/ollama/ollama/blob/main/docs/api.md). Example models:
+
+    * nomic-embed-text
+    * mxbai-embed-large
+    """
+
+    def __init__(
+        self,
+        url: str = "http://localhost:11434",
+        model: str = "mxbai-embed-large",
+        batch_size: int = 256,
+    ):
+        """
+        Used to generate embeddings for Nodes.
+        """
+        self.url = url
+        self.model = model
+        self.batch_size = batch_size
+
+    def embed_many(self, texts: List[str]) -> List[List[float]]:
+        """
+        Generate embeddings for a list of texts. The Ollama embeddings endpoint
+        accepts a single prompt per request, so texts are sent one at a time
+        (cf. https://ollama.com/blog/embedding-models).
+
+        Args:
+            texts (List[str]): The list of texts to embed.
+
+        Returns:
+            List[List[float]]: A list of embeddings, one per input text.
+        """
+        conn = self._create_conn()
+        res = []
+        for i in range(0, len(texts), self.batch_size):
+            batch_texts = texts[i : i + self.batch_size]
+            for text in batch_texts:
+                params = json.dumps({"model": self.model, "prompt": text})
+                headers = {"Content-Type": "application/json", "Accept": "application/json"}
+                conn.request("POST", "/api/embeddings", params, headers)
+                response = conn.getresponse()
+                if response.status != 200:
+                    raise RuntimeError(
+                        "embeddings request failed: {} {}".format(
+                            response.status, response.reason
+                        )
+                    )
+                doc = json.loads(response.read())
+                res.append(doc["embedding"])  # one embedding vector per input text
+        conn.close()
+        return res
+
+    def _create_conn(self):
+        parsed = urlparse(self.url)
+        if parsed.scheme == "https":
+            return HTTPSConnection(parsed.hostname, parsed.port)
+        else:
+            return HTTPConnection(parsed.hostname, parsed.port)
+
+
 class CombineNodesSemantically(ProcessingStep):
     """
     Combines nodes that are semantically related.
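
A minimal usage sketch of the new pipeline, assuming a local Ollama server is reachable on the default port with the `mxbai-embed-large` model already pulled; `"sample.pdf"` is a placeholder path:

```python
# Sketch only: wire the Ollama-backed pipeline into openparse's DocumentParser.
# Assumes `ollama serve` is running on localhost:11434 and the embedding model
# has been pulled (e.g. `ollama pull mxbai-embed-large`).
import openparse
from openparse.processing import LocalSemanticIngestionPipeline

pipeline = LocalSemanticIngestionPipeline(
    url="http://localhost:11434",
    model="mxbai-embed-large",
)
parser = openparse.DocumentParser(processing_pipeline=pipeline)
parsed_doc = parser.parse("sample.pdf")  # placeholder document path

for node in parsed_doc.nodes:
    print(node)
```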
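
The embeddings client added in `semantic_transforms.py` can also be exercised on its own; this sketch again assumes a running Ollama server, and the sample strings are arbitrary:

```python
# Sketch: call the new OllamaEmbeddings client directly to verify connectivity.
from openparse.processing import OllamaEmbeddings

client = OllamaEmbeddings(url="http://localhost:11434", model="mxbai-embed-large")
vectors = client.embed_many(["first chunk of text", "second chunk of text"])
print(len(vectors), len(vectors[0]))  # 2 vectors, each of the model's dimension
```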