From 46023642aae9c12ff32a7269198b49871ed221f3 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Fri, 12 Apr 2024 11:52:52 +0200 Subject: [PATCH] support embeddings via ollama This adds semantic_transforms.OllamaEmbeddings, which allows calculating embeddings locally using ollama (https://ollama.com/), following the API of OpenAIEmbeddings. Currently, ollama does not support batching (but it is on their roadmap, cf. https://ollama.com/blog/embedding-models). The LocalSemanticIngestionPipeline shows how it can be used. To test locally, install ollama, then pull an embeddings model, such as https://ollama.com/library/mxbai-embed-large, then: from openparse import processing, DocumentParser semantic_pipeline = processing.LocalSemanticIngestionPipeline( url="http://localhost:11434", model="mxbai-embed-large", ) parser = DocumentParser( processing_pipeline=semantic_pipeline, ) parsed = parser.parse("path/to/file.pdf") --- src/openparse/processing/__init__.py | 5 +- src/openparse/processing/ingest.py | 43 ++++++++++++ .../processing/semantic_transforms.py | 66 +++++++++++++++++++ 3 files changed, 113 insertions(+), 1 deletion(-) diff --git a/src/openparse/processing/__init__.py b/src/openparse/processing/__init__.py index aabb0fe..5eff375 100644 --- a/src/openparse/processing/__init__.py +++ b/src/openparse/processing/__init__.py @@ -2,6 +2,7 @@ IngestionPipeline, BasicIngestionPipeline, SemanticIngestionPipeline, + LocalSemanticIngestionPipeline, NoOpIngestionPipeline, ) from .basic_transforms import ( @@ -15,7 +16,7 @@ CombineNodesSpatially, RemoveNodesBelowNTokens, ) -from .semantic_transforms import CombineNodesSemantically, OpenAIEmbeddings +from .semantic_transforms import CombineNodesSemantically, OpenAIEmbeddings, OllamaEmbeddings __all__ = [ "ProcessingStep", @@ -29,8 +30,10 @@ "BasicIngestionPipeline", "IngestionPipeline", "SemanticIngestionPipeline", + "LocalSemanticIngestionPipeline", "NoOpIngestionPipeline", "RemoveNodesBelowNTokens",
"CombineNodesSemantically", "OpenAIEmbeddings", + "OllamaEmbeddings", ] diff --git a/src/openparse/processing/ingest.py b/src/openparse/processing/ingest.py index cef9414..0c58edb 100644 --- a/src/openparse/processing/ingest.py +++ b/src/openparse/processing/ingest.py @@ -17,6 +17,7 @@ from openparse.processing.semantic_transforms import ( CombineNodesSemantically, OpenAIEmbeddings, + OllamaEmbeddings, EmbeddingModel, ) @@ -131,3 +132,45 @@ def __init__( ), RemoveNodesBelowNTokens(min_tokens=min_tokens), ] + +class LocalSemanticIngestionPipeline(IngestionPipeline): + """ + A semantic pipeline for ingesting and processing Nodes using ollama for embeddings. + """ + + def __init__( + self, + url: str = "http://localhost:11434", + model: str = "mxbai-embed-large", + min_tokens: int = consts.TOKENIZATION_LOWER_LIMIT, + max_tokens: int = consts.TOKENIZATION_UPPER_LIMIT, + ) -> None: + embedding_client = OllamaEmbeddings(url=url, model=model) + + self.transformations = [ + RemoveTextInsideTables(), + RemoveFullPageStubs(max_area_pct=0.35), + # mostly aimed at combining bullets and weird formatting + CombineNodesSpatially( + x_error_margin=10, + y_error_margin=2, + criteria="both_small", + ), + CombineHeadingsWithClosestText(), + CombineBullets(), + RemoveMetadataElements(), + RemoveRepeatedElements(threshold=2), + RemoveNodesBelowNTokens(min_tokens=10), + CombineBullets(), + CombineNodesSemantically( + embedding_client=embedding_client, + min_similarity=0.6, + max_tokens=max_tokens // 2, + ), + CombineNodesSemantically( + embedding_client=embedding_client, + min_similarity=0.55, + max_tokens=max_tokens, + ), + RemoveNodesBelowNTokens(min_tokens=min_tokens), + ] diff --git a/src/openparse/processing/semantic_transforms.py b/src/openparse/processing/semantic_transforms.py index d01d085..a2a2fb2 100644 --- a/src/openparse/processing/semantic_transforms.py +++ b/src/openparse/processing/semantic_transforms.py @@ -1,4 +1,8 @@ +import json + from typing import List, Literal, 
Dict, Union +from urllib.parse import urlparse +from http.client import HTTPConnection, HTTPSConnection import numpy as np @@ -68,6 +72,68 @@ def _create_client(self): return OpenAI(api_key=self.api_key) +class OllamaEmbeddings: + """ + Use local models via ollama for calculating embeddings. Uses the REST API + https://github.com/ollama/ollama/blob/main/docs/api.md. + + * nomic-embed-text + * mxbai-embed-large + """ + + def __init__( + self, + url: str = "http://localhost:11434/", + model: str = "mxbai-embed-large", + batch_size: int = 256, + ): + """ + Used to generate embeddings for Nodes. + """ + self.url = url + self.model = model + self.batch_size = batch_size + + def embed_many(self, texts: List[str]) -> List[List[float]]: + """ + Generate embeddings for a list of texts. Support for batches coming + soon, cf. https://ollama.com/blog/embedding-models + + Args: + texts (list[str]): The list of texts to embed. + Note: requests are issued in chunks of ``self.batch_size``. + + Returns: + List[List[float]]: A list of embeddings.
+ """ + conn = self._create_conn() + res = [] + for i in range(0, len(texts), self.batch_size): + batch_texts = texts[i : i + self.batch_size] + for text in batch_texts: + params = json.dumps({"model": self.model, "prompt": text}) + headers = {"Content-Type": "application/json", "Accept": "application/json"} + conn.request("POST", "/api/embeddings", params, headers) + response = conn.getresponse() + if response.status != 200: + raise RuntimeError( + "embeddings request failed: {} {}".format( + response.status, response.reason + ) + ) + doc = json.loads(response.read()) + res.append(doc["embedding"]) + conn.close() + return res + + def _create_conn(self): + parsed = urlparse(self.url) + if parsed.scheme == "https": + return HTTPSConnection(parsed.hostname, parsed.port) + else: + return HTTPConnection(parsed.hostname, parsed.port) + + class CombineNodesSemantically(ProcessingStep): """ Combines nodes that are semantically related.