From 2e8871412a1458821ff04c8d402af680f8927ac5 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Thu, 19 Jun 2025 00:29:52 +0000 Subject: [PATCH 1/4] refactor: rename orcabridge to orcapod --- misc/demo_redis_mocking.py | 2 +- src/{orcabridge => orcapod}/__init__.py | 0 src/{orcabridge => orcapod}/core/__init__.py | 0 src/{orcabridge => orcapod}/core/base.py | 6 ++-- src/{orcabridge => orcapod}/core/operators.py | 10 +++--- src/{orcabridge => orcapod}/core/sources.py | 8 ++--- src/{orcabridge => orcapod}/core/streams.py | 4 +-- src/{orcabridge => orcapod}/core/tracker.py | 2 +- src/{orcabridge => orcapod}/dj/__init__.py | 0 src/{orcabridge => orcapod}/dj/mapper.py | 2 +- src/{orcabridge => orcapod}/dj/operation.py | 0 src/{orcabridge => orcapod}/dj/pod.py | 0 src/{orcabridge => orcapod}/dj/source.py | 6 ++-- src/{orcabridge => orcapod}/dj/stream.py | 2 +- src/{orcabridge => orcapod}/dj/tracker.py | 8 ++--- .../hashing/__init__.py | 0 src/{orcabridge => orcapod}/hashing/core.py | 4 +-- .../hashing/defaults.py | 14 ++++---- .../hashing/file_hashers.py | 6 ++-- .../hashing/function_info_extractors.py | 2 +- .../hashing/hashing_legacy.py | 0 .../hashing/object_hashers.py | 0 .../hashing/semantic_arrow_hasher.py | 0 .../hashing/string_cachers.py | 2 +- src/{orcabridge => orcapod}/hashing/types.py | 2 +- .../pipeline/pipeline.py | 8 ++--- src/{orcabridge => orcapod}/pod/__init__.py | 0 src/{orcabridge => orcapod}/pod/core.py | 20 +++++------ src/{orcabridge => orcapod}/py.typed | 0 src/{orcabridge => orcapod}/store/__init__.py | 0 .../store/arrow_data_stores.py | 2 +- src/{orcabridge => orcapod}/store/core.py | 10 +++--- src/{orcabridge => orcapod}/store/file.py | 2 +- src/{orcabridge => orcapod}/store/file_ops.py | 2 +- .../store/safe_dir_data_store.py | 2 +- src/{orcabridge => orcapod}/store/transfer.py | 4 +-- src/{orcabridge => orcapod}/store/types.py | 2 +- src/{orcabridge => orcapod}/types/__init__.py | 0 src/{orcabridge => orcapod}/types/core.py | 0 src/{orcabridge => orcapod}/types/default.py | 0 src/{orcabridge => orcapod}/types/handlers.py | 0 .../types/inference.py | 0 src/{orcabridge => orcapod}/types/registry.py | 2 +- src/{orcabridge => orcapod}/types/utils.py | 0 src/{orcabridge => orcapod}/utils/__init__.py | 0 src/{orcabridge => orcapod}/utils/name.py | 0 .../utils/stream_utils.py | 2 +- tests/test_hashing/generate_file_hashes.py | 2 +- tests/test_hashing/generate_hash_examples.py | 2 +- .../generate_pathset_packet_hashes.py | 2 +- .../test_basic_composite_hasher.py | 2 +- tests/test_hashing/test_basic_hashing.py | 2 +- tests/test_hashing/test_cached_file_hasher.py | 6 ++-- tests/test_hashing/test_composite_hasher.py | 13 ++++--- tests/test_hashing/test_file_hashes.py | 2 +- tests/test_hashing/test_hash_samples.py | 2 +- tests/test_hashing/test_hasher_factory.py | 4 +-- tests/test_hashing/test_hasher_parity.py | 4 +-- tests/test_hashing/test_packet_hasher.py | 4 +-- tests/test_hashing/test_path_set_hasher.py | 17 +++++---- tests/test_hashing/test_pathset_and_packet.py | 2 +- .../test_pathset_packet_hashes.py | 2 +- tests/test_hashing/test_process_structure.py | 2 +- tests/test_hashing/test_sqlite_cacher.py | 2 +- .../test_string_cacher/test_file_cacher.py | 2 +- .../test_in_memory_cacher.py | 2 +- .../test_string_cacher/test_redis_cacher.py | 36 +++++++++---------- .../test_string_cacher/test_sqlite_cacher.py | 2 +- tests/test_store/test_dir_data_store.py | 10 +++--- tests/test_store/test_integration.py | 6 ++-- tests/test_store/test_noop_data_store.py | 4 +-- 
tests/test_store/test_transfer_data_store.py | 6 ++-- .../test_extract_function_data_types.py | 2 +- 73 files changed, 137 insertions(+), 139 deletions(-) rename src/{orcabridge => orcapod}/__init__.py (100%) rename src/{orcabridge => orcapod}/core/__init__.py (100%) rename src/{orcabridge => orcapod}/core/base.py (99%) rename src/{orcabridge => orcapod}/core/operators.py (99%) rename src/{orcabridge => orcapod}/core/sources.py (96%) rename src/{orcabridge => orcapod}/core/streams.py (96%) rename src/{orcabridge => orcapod}/core/tracker.py (97%) rename src/{orcabridge => orcapod}/dj/__init__.py (100%) rename src/{orcabridge => orcapod}/dj/mapper.py (98%) rename src/{orcabridge => orcapod}/dj/operation.py (100%) rename src/{orcabridge => orcapod}/dj/pod.py (100%) rename src/{orcabridge => orcapod}/dj/source.py (99%) rename src/{orcabridge => orcapod}/dj/stream.py (99%) rename src/{orcabridge => orcapod}/dj/tracker.py (96%) rename src/{orcabridge => orcapod}/hashing/__init__.py (100%) rename src/{orcabridge => orcapod}/hashing/core.py (99%) rename src/{orcabridge => orcapod}/hashing/defaults.py (72%) rename src/{orcabridge => orcapod}/hashing/file_hashers.py (96%) rename src/{orcabridge => orcapod}/hashing/function_info_extractors.py (98%) rename src/{orcabridge => orcapod}/hashing/hashing_legacy.py (100%) rename src/{orcabridge => orcapod}/hashing/object_hashers.py (100%) rename src/{orcabridge => orcapod}/hashing/semantic_arrow_hasher.py (100%) rename src/{orcabridge => orcapod}/hashing/string_cachers.py (99%) rename src/{orcabridge => orcapod}/hashing/types.py (98%) rename src/{orcabridge => orcapod}/pipeline/pipeline.py (99%) rename src/{orcabridge => orcapod}/pod/__init__.py (100%) rename src/{orcabridge => orcapod}/pod/core.py (98%) rename src/{orcabridge => orcapod}/py.typed (100%) rename src/{orcabridge => orcapod}/store/__init__.py (100%) rename src/{orcabridge => orcapod}/store/arrow_data_stores.py (99%) rename src/{orcabridge => orcapod}/store/core.py (97%) rename src/{orcabridge => orcapod}/store/file.py (99%) rename src/{orcabridge => orcapod}/store/file_ops.py (99%) rename src/{orcabridge => orcapod}/store/safe_dir_data_store.py (99%) rename src/{orcabridge => orcapod}/store/transfer.py (96%) rename src/{orcabridge => orcapod}/store/types.py (98%) rename src/{orcabridge => orcapod}/types/__init__.py (100%) rename src/{orcabridge => orcapod}/types/core.py (100%) rename src/{orcabridge => orcapod}/types/default.py (100%) rename src/{orcabridge => orcapod}/types/handlers.py (100%) rename src/{orcabridge => orcapod}/types/inference.py (100%) rename src/{orcabridge => orcapod}/types/registry.py (99%) rename src/{orcabridge => orcapod}/types/utils.py (100%) rename src/{orcabridge => orcapod}/utils/__init__.py (100%) rename src/{orcabridge => orcapod}/utils/name.py (100%) rename src/{orcabridge => orcapod}/utils/stream_utils.py (98%) diff --git a/misc/demo_redis_mocking.py b/misc/demo_redis_mocking.py index 7ebdd8f..cc18dcb 100644 --- a/misc/demo_redis_mocking.py +++ b/misc/demo_redis_mocking.py @@ -79,7 +79,7 @@ def demonstrate_redis_mocking(): MockConnectionError, ), ): - from orcabridge.hashing.string_cachers import RedisCacher + from orcapod.hashing.string_cachers import RedisCacher # Create a mock Redis instance mock_redis = MockRedis() diff --git a/src/orcabridge/__init__.py b/src/orcapod/__init__.py similarity index 100% rename from src/orcabridge/__init__.py rename to src/orcapod/__init__.py diff --git a/src/orcabridge/core/__init__.py b/src/orcapod/core/__init__.py similarity 
index 100% rename from src/orcabridge/core/__init__.py rename to src/orcapod/core/__init__.py diff --git a/src/orcabridge/core/base.py b/src/orcapod/core/base.py similarity index 99% rename from src/orcabridge/core/base.py rename to src/orcapod/core/base.py index 7c025e6..0b1ed63 100644 --- a/src/orcabridge/core/base.py +++ b/src/orcapod/core/base.py @@ -5,9 +5,9 @@ from typing import Any, TypeVar, Hashable -from orcabridge.hashing import HashableMixin -from orcabridge.types import Packet, Tag, TypeSpec -from orcabridge.utils.stream_utils import get_typespec +from orcapod.hashing import HashableMixin +from orcapod.types import Packet, Tag, TypeSpec +from orcapod.utils.stream_utils import get_typespec import logging diff --git a/src/orcabridge/core/operators.py b/src/orcapod/core/operators.py similarity index 99% rename from src/orcabridge/core/operators.py rename to src/orcapod/core/operators.py index 04a2795..093167b 100644 --- a/src/orcabridge/core/operators.py +++ b/src/orcapod/core/operators.py @@ -4,10 +4,10 @@ from typing import Any -from orcabridge.core.base import Operator, SyncStream -from orcabridge.hashing import function_content_hash, hash_function -from orcabridge.core.streams import SyncStreamFromGenerator -from orcabridge.utils.stream_utils import ( +from orcapod.core.base import Operator, SyncStream +from orcapod.hashing import function_content_hash, hash_function +from orcapod.core.streams import SyncStreamFromGenerator +from orcapod.utils.stream_utils import ( batch_packet, batch_tags, check_packet_compatibility, @@ -16,7 +16,7 @@ merge_typespecs, ) -from orcabridge.types import Packet, Tag, TypeSpec +from orcapod.types import Packet, Tag, TypeSpec class Repeat(Operator): diff --git a/src/orcabridge/core/sources.py b/src/orcapod/core/sources.py similarity index 96% rename from src/orcabridge/core/sources.py rename to src/orcapod/core/sources.py index 235372c..33df20d 100644 --- a/src/orcabridge/core/sources.py +++ b/src/orcapod/core/sources.py @@ -3,10 +3,10 @@ from pathlib import Path from typing import Any, Literal -from orcabridge.core.base import Source -from orcabridge.hashing import hash_function -from orcabridge.core.streams import SyncStream, SyncStreamFromGenerator -from orcabridge.types import Packet, Tag +from orcapod.core.base import Source +from orcapod.hashing import hash_function +from orcapod.core.streams import SyncStream, SyncStreamFromGenerator +from orcapod.types import Packet, Tag class GlobSource(Source): diff --git a/src/orcabridge/core/streams.py b/src/orcapod/core/streams.py similarity index 96% rename from src/orcabridge/core/streams.py rename to src/orcapod/core/streams.py index 4f4f3c3..77cdbe3 100644 --- a/src/orcabridge/core/streams.py +++ b/src/orcapod/core/streams.py @@ -1,7 +1,7 @@ from collections.abc import Callable, Collection, Iterator -from orcabridge.core.base import SyncStream -from orcabridge.types import Packet, Tag +from orcapod.core.base import SyncStream +from orcapod.types import Packet, Tag class SyncStreamFromLists(SyncStream): diff --git a/src/orcabridge/core/tracker.py b/src/orcapod/core/tracker.py similarity index 97% rename from src/orcabridge/core/tracker.py rename to src/orcapod/core/tracker.py index 6e3afa9..efc2c42 100644 --- a/src/orcabridge/core/tracker.py +++ b/src/orcapod/core/tracker.py @@ -1,4 +1,4 @@ -from orcabridge.core.base import Invocation, Kernel, Tracker +from orcapod.core.base import Invocation, Kernel, Tracker class GraphTracker(Tracker): diff --git a/src/orcabridge/dj/__init__.py 
b/src/orcapod/dj/__init__.py similarity index 100% rename from src/orcabridge/dj/__init__.py rename to src/orcapod/dj/__init__.py diff --git a/src/orcabridge/dj/mapper.py b/src/orcapod/dj/mapper.py similarity index 98% rename from src/orcabridge/dj/mapper.py rename to src/orcapod/dj/mapper.py index d3f2d69..a38fdaf 100644 --- a/src/orcabridge/dj/mapper.py +++ b/src/orcapod/dj/mapper.py @@ -1,7 +1,7 @@ import warnings from typing import Optional -from orcabridge.mappers import Join, MapPackets, Mapper, MapTags +from orcapod.mappers import Join, MapPackets, Mapper, MapTags from .operation import QueryOperation from .stream import QueryStream diff --git a/src/orcabridge/dj/operation.py b/src/orcapod/dj/operation.py similarity index 100% rename from src/orcabridge/dj/operation.py rename to src/orcapod/dj/operation.py diff --git a/src/orcabridge/dj/pod.py b/src/orcapod/dj/pod.py similarity index 100% rename from src/orcabridge/dj/pod.py rename to src/orcapod/dj/pod.py diff --git a/src/orcabridge/dj/source.py b/src/orcapod/dj/source.py similarity index 99% rename from src/orcabridge/dj/source.py rename to src/orcapod/dj/source.py index cbcf0d7..8af3f23 100644 --- a/src/orcabridge/dj/source.py +++ b/src/orcapod/dj/source.py @@ -4,10 +4,10 @@ import datajoint as dj from datajoint import Schema, Table -from orcabridge.hashing import hash_to_uuid +from orcapod.hashing import hash_to_uuid -from orcabridge.sources import Source -from orcabridge.streams import SyncStream +from orcapod.sources import Source +from orcapod.streams import SyncStream from ..utils.name import pascal_to_snake, snake_to_pascal from ..utils.stream_utils import common_elements from .operation import QueryOperation diff --git a/src/orcabridge/dj/stream.py b/src/orcapod/dj/stream.py similarity index 99% rename from src/orcabridge/dj/stream.py rename to src/orcapod/dj/stream.py index c8677f5..3e4eb08 100644 --- a/src/orcabridge/dj/stream.py +++ b/src/orcapod/dj/stream.py @@ -5,7 +5,7 @@ from datajoint.expression import QueryExpression from datajoint.table import Table -from orcabridge.streams import SyncStream +from orcapod.streams import SyncStream logger = logging.getLogger(__name__) diff --git a/src/orcabridge/dj/tracker.py b/src/orcapod/dj/tracker.py similarity index 96% rename from src/orcabridge/dj/tracker.py rename to src/orcapod/dj/tracker.py index 4e92273..b137e54 100644 --- a/src/orcabridge/dj/tracker.py +++ b/src/orcapod/dj/tracker.py @@ -6,10 +6,10 @@ import networkx as nx from datajoint import Schema -from orcabridge.base import Operation, Source -from orcabridge.mappers import Mapper, Merge -from orcabridge.pod import FunctionPod -from orcabridge.pipeline import GraphTracker +from orcapod.base import Operation, Source +from orcapod.mappers import Mapper, Merge +from orcapod.pod import FunctionPod +from orcapod.pipeline import GraphTracker from .mapper import convert_to_query_mapper from .operation import QueryOperation diff --git a/src/orcabridge/hashing/__init__.py b/src/orcapod/hashing/__init__.py similarity index 100% rename from src/orcabridge/hashing/__init__.py rename to src/orcapod/hashing/__init__.py diff --git a/src/orcabridge/hashing/core.py b/src/orcapod/hashing/core.py similarity index 99% rename from src/orcabridge/hashing/core.py rename to src/orcapod/hashing/core.py index dcd7f79..c711f63 100644 --- a/src/orcabridge/hashing/core.py +++ b/src/orcapod/hashing/core.py @@ -31,8 +31,8 @@ import xxhash -from orcabridge.types import Packet, PathSet -from orcabridge.utils.name import find_noncolliding_name 
+from orcapod.types import Packet, PathSet +from orcapod.utils.name import find_noncolliding_name # Configure logging with __name__ for proper hierarchy logger = logging.getLogger(__name__) diff --git a/src/orcabridge/hashing/defaults.py b/src/orcapod/hashing/defaults.py similarity index 72% rename from src/orcabridge/hashing/defaults.py rename to src/orcapod/hashing/defaults.py index 5a5d587..85e1405 100644 --- a/src/orcabridge/hashing/defaults.py +++ b/src/orcapod/hashing/defaults.py @@ -1,12 +1,12 @@ # A collection of utility function that provides a "default" implementation of hashers. # This is often used as the fallback hasher in the library code. -from orcabridge.hashing.types import CompositeFileHasher, ArrowHasher -from orcabridge.hashing.file_hashers import PathLikeHasherFactory -from orcabridge.hashing.string_cachers import InMemoryCacher -from orcabridge.hashing.object_hashers import ObjectHasher -from orcabridge.hashing.object_hashers import LegacyObjectHasher -from orcabridge.hashing.function_info_extractors import FunctionInfoExtractorFactory -from orcabridge.hashing.semantic_arrow_hasher import SemanticArrowHasher, PathHasher +from orcapod.hashing.types import CompositeFileHasher, ArrowHasher +from orcapod.hashing.file_hashers import PathLikeHasherFactory +from orcapod.hashing.string_cachers import InMemoryCacher +from orcapod.hashing.object_hashers import ObjectHasher +from orcapod.hashing.object_hashers import LegacyObjectHasher +from orcapod.hashing.function_info_extractors import FunctionInfoExtractorFactory +from orcapod.hashing.semantic_arrow_hasher import SemanticArrowHasher, PathHasher def get_default_composite_file_hasher(with_cache=True) -> CompositeFileHasher: diff --git a/src/orcabridge/hashing/file_hashers.py b/src/orcapod/hashing/file_hashers.py similarity index 96% rename from src/orcabridge/hashing/file_hashers.py rename to src/orcapod/hashing/file_hashers.py index 5c66581..77833ee 100644 --- a/src/orcabridge/hashing/file_hashers.py +++ b/src/orcapod/hashing/file_hashers.py @@ -1,11 +1,11 @@ -from orcabridge.hashing.core import hash_file, hash_pathset, hash_packet -from orcabridge.hashing.types import ( +from orcapod.hashing.core import hash_file, hash_pathset, hash_packet +from orcapod.hashing.types import ( FileHasher, PathSetHasher, StringCacher, CompositeFileHasher, ) -from orcabridge.types import Packet, PathLike, PathSet +from orcapod.types import Packet, PathLike, PathSet # Completely unnecessary to inherit from FileHasher, but this diff --git a/src/orcabridge/hashing/function_info_extractors.py b/src/orcapod/hashing/function_info_extractors.py similarity index 98% rename from src/orcabridge/hashing/function_info_extractors.py rename to src/orcapod/hashing/function_info_extractors.py index 4f9bb58..2c32f05 100644 --- a/src/orcabridge/hashing/function_info_extractors.py +++ b/src/orcapod/hashing/function_info_extractors.py @@ -1,7 +1,7 @@ from .types import FunctionInfoExtractor from collections.abc import Callable from typing import Any, Literal -from orcabridge.types import TypeSpec +from orcapod.types import TypeSpec import inspect diff --git a/src/orcabridge/hashing/hashing_legacy.py b/src/orcapod/hashing/hashing_legacy.py similarity index 100% rename from src/orcabridge/hashing/hashing_legacy.py rename to src/orcapod/hashing/hashing_legacy.py diff --git a/src/orcabridge/hashing/object_hashers.py b/src/orcapod/hashing/object_hashers.py similarity index 100% rename from src/orcabridge/hashing/object_hashers.py rename to 
src/orcapod/hashing/object_hashers.py diff --git a/src/orcabridge/hashing/semantic_arrow_hasher.py b/src/orcapod/hashing/semantic_arrow_hasher.py similarity index 100% rename from src/orcabridge/hashing/semantic_arrow_hasher.py rename to src/orcapod/hashing/semantic_arrow_hasher.py diff --git a/src/orcabridge/hashing/string_cachers.py b/src/orcapod/hashing/string_cachers.py similarity index 99% rename from src/orcabridge/hashing/string_cachers.py rename to src/orcapod/hashing/string_cachers.py index 0db7af4..9b2244a 100644 --- a/src/orcabridge/hashing/string_cachers.py +++ b/src/orcapod/hashing/string_cachers.py @@ -6,7 +6,7 @@ from pathlib import Path from typing import TYPE_CHECKING, Any, TYPE_CHECKING -from orcabridge.hashing.types import StringCacher +from orcapod.hashing.types import StringCacher logger = logging.getLogger(__name__) diff --git a/src/orcabridge/hashing/types.py b/src/orcapod/hashing/types.py similarity index 98% rename from src/orcabridge/hashing/types.py rename to src/orcapod/hashing/types.py index b986941..5e8b07c 100644 --- a/src/orcabridge/hashing/types.py +++ b/src/orcapod/hashing/types.py @@ -5,7 +5,7 @@ from typing import Any, Protocol, runtime_checkable import uuid -from orcabridge.types import Packet, PathLike, PathSet, TypeSpec +from orcapod.types import Packet, PathLike, PathSet, TypeSpec import pyarrow as pa diff --git a/src/orcabridge/pipeline/pipeline.py b/src/orcapod/pipeline/pipeline.py similarity index 99% rename from src/orcabridge/pipeline/pipeline.py rename to src/orcapod/pipeline/pipeline.py index f203bdb..f160f2b 100644 --- a/src/orcabridge/pipeline/pipeline.py +++ b/src/orcapod/pipeline/pipeline.py @@ -10,9 +10,9 @@ import networkx as nx import pandas as pd -from orcabridge.core.base import Invocation, Kernel -from orcabridge.hashing import hash_to_hex -from orcabridge.core.tracker import GraphTracker +from orcapod.core.base import Invocation, Kernel +from orcapod.hashing import hash_to_hex +from orcapod.core.tracker import GraphTracker logger = logging.getLogger(__name__) @@ -713,7 +713,7 @@ def validate_pipeline_serializability(pipeline: Pipeline) -> None: def create_example_pipeline() -> Pipeline: """Create an example pipeline for testing""" - from orcabridge import GlobSource, function_pod + from orcapod import GlobSource, function_pod @function_pod def example_function(input_file): diff --git a/src/orcabridge/pod/__init__.py b/src/orcapod/pod/__init__.py similarity index 100% rename from src/orcabridge/pod/__init__.py rename to src/orcapod/pod/__init__.py diff --git a/src/orcabridge/pod/core.py b/src/orcapod/pod/core.py similarity index 98% rename from src/orcabridge/pod/core.py rename to src/orcapod/pod/core.py index 364bc90..a82944f 100644 --- a/src/orcabridge/pod/core.py +++ b/src/orcapod/pod/core.py @@ -11,10 +11,10 @@ Literal, ) -from orcabridge.types.registry import PacketConverter +from orcapod.types.registry import PacketConverter -from orcabridge.core.base import Kernel -from orcabridge.hashing import ( +from orcapod.core.base import Kernel +from orcapod.hashing import ( ObjectHasher, ArrowHasher, FunctionInfoExtractor, @@ -23,18 +23,18 @@ get_default_object_hasher, get_default_arrow_hasher, ) -from orcabridge.core.operators import Join -from orcabridge.store import DataStore, ArrowDataStore, NoOpDataStore -from orcabridge.core.streams import SyncStream, SyncStreamFromGenerator -from orcabridge.types import Packet, PathSet, PodFunction, Tag, TypeSpec +from orcapod.core.operators import Join +from orcapod.store import DataStore, 
ArrowDataStore, NoOpDataStore +from orcapod.core.streams import SyncStream, SyncStreamFromGenerator +from orcapod.types import Packet, PathSet, PodFunction, Tag, TypeSpec -from orcabridge.types.default import default_registry -from orcabridge.types.inference import ( +from orcapod.types.default import default_registry +from orcapod.types.inference import ( extract_function_data_types, verify_against_typespec, check_typespec_compatibility, ) -from orcabridge.types.registry import is_packet_supported +from orcapod.types.registry import is_packet_supported import polars as pl logger = logging.getLogger(__name__) diff --git a/src/orcabridge/py.typed b/src/orcapod/py.typed similarity index 100% rename from src/orcabridge/py.typed rename to src/orcapod/py.typed diff --git a/src/orcabridge/store/__init__.py b/src/orcapod/store/__init__.py similarity index 100% rename from src/orcabridge/store/__init__.py rename to src/orcapod/store/__init__.py diff --git a/src/orcabridge/store/arrow_data_stores.py b/src/orcapod/store/arrow_data_stores.py similarity index 99% rename from src/orcabridge/store/arrow_data_stores.py rename to src/orcapod/store/arrow_data_stores.py index 1e866d5..4be9698 100644 --- a/src/orcabridge/store/arrow_data_stores.py +++ b/src/orcapod/store/arrow_data_stores.py @@ -7,7 +7,7 @@ from dataclasses import dataclass from datetime import datetime, timedelta import logging -from orcabridge.store.types import DuplicateError +from orcapod.store.types import DuplicateError # Module-level logger logger = logging.getLogger(__name__) diff --git a/src/orcabridge/store/core.py b/src/orcapod/store/core.py similarity index 97% rename from src/orcabridge/store/core.py rename to src/orcapod/store/core.py index 428819a..c41dd55 100644 --- a/src/orcabridge/store/core.py +++ b/src/orcapod/store/core.py @@ -4,11 +4,11 @@ from os import PathLike from pathlib import Path -from orcabridge.hashing import hash_packet -from orcabridge.hashing.defaults import get_default_composite_file_hasher -from orcabridge.hashing.types import PacketHasher -from orcabridge.store.types import DataStore -from orcabridge.types import Packet +from orcapod.hashing import hash_packet +from orcapod.hashing.defaults import get_default_composite_file_hasher +from orcapod.hashing.types import PacketHasher +from orcapod.store.types import DataStore +from orcapod.types import Packet logger = logging.getLogger(__name__) diff --git a/src/orcabridge/store/file.py b/src/orcapod/store/file.py similarity index 99% rename from src/orcabridge/store/file.py rename to src/orcapod/store/file.py index 91961fb..0de8aff 100644 --- a/src/orcabridge/store/file.py +++ b/src/orcapod/store/file.py @@ -5,7 +5,7 @@ from pathlib import Path from typing import Callable, Collection, Dict, Optional, Tuple, Union -from orcabridge.types import Packet, PathSet +from orcapod.types import Packet, PathSet @contextlib.contextmanager diff --git a/src/orcabridge/store/file_ops.py b/src/orcapod/store/file_ops.py similarity index 99% rename from src/orcabridge/store/file_ops.py rename to src/orcapod/store/file_ops.py index 33675a0..0e34213 100644 --- a/src/orcabridge/store/file_ops.py +++ b/src/orcapod/store/file_ops.py @@ -4,7 +4,7 @@ import os from pathlib import Path -from orcabridge.types import PathLike +from orcapod.types import PathLike logger = logging.getLogger(__name__) diff --git a/src/orcabridge/store/safe_dir_data_store.py b/src/orcapod/store/safe_dir_data_store.py similarity index 99% rename from src/orcabridge/store/safe_dir_data_store.py rename to 
src/orcapod/store/safe_dir_data_store.py index 548039f..0f0ce6a 100644 --- a/src/orcabridge/store/safe_dir_data_store.py +++ b/src/orcapod/store/safe_dir_data_store.py @@ -205,7 +205,7 @@ def __init__( def _get_output_dir(self, function_name, content_hash, packet): """Get the output directory for a specific packet""" - from orcabridge.hashing.core import hash_dict + from orcapod.hashing.core import hash_dict packet_hash = hash_dict(packet) return self.store_dir / function_name / content_hash / str(packet_hash) diff --git a/src/orcabridge/store/transfer.py b/src/orcapod/store/transfer.py similarity index 96% rename from src/orcabridge/store/transfer.py rename to src/orcapod/store/transfer.py index c4757ef..c9a4e5d 100644 --- a/src/orcabridge/store/transfer.py +++ b/src/orcapod/store/transfer.py @@ -1,7 +1,7 @@ # Implements transfer data store that lets you transfer memoized packets between data stores. -from orcabridge.store.types import DataStore -from orcabridge.types import Packet +from orcapod.store.types import DataStore +from orcapod.types import Packet class TransferDataStore(DataStore): diff --git a/src/orcabridge/store/types.py b/src/orcapod/store/types.py similarity index 98% rename from src/orcabridge/store/types.py rename to src/orcapod/store/types.py index 444149d..6c1b5af 100644 --- a/src/orcabridge/store/types.py +++ b/src/orcapod/store/types.py @@ -1,6 +1,6 @@ from typing import Protocol, runtime_checkable -from orcabridge.types import Tag, Packet +from orcapod.types import Tag, Packet import pyarrow as pa import polars as pl diff --git a/src/orcabridge/types/__init__.py b/src/orcapod/types/__init__.py similarity index 100% rename from src/orcabridge/types/__init__.py rename to src/orcapod/types/__init__.py diff --git a/src/orcabridge/types/core.py b/src/orcapod/types/core.py similarity index 100% rename from src/orcabridge/types/core.py rename to src/orcapod/types/core.py diff --git a/src/orcabridge/types/default.py b/src/orcapod/types/default.py similarity index 100% rename from src/orcabridge/types/default.py rename to src/orcapod/types/default.py diff --git a/src/orcabridge/types/handlers.py b/src/orcapod/types/handlers.py similarity index 100% rename from src/orcabridge/types/handlers.py rename to src/orcapod/types/handlers.py diff --git a/src/orcabridge/types/inference.py b/src/orcapod/types/inference.py similarity index 100% rename from src/orcabridge/types/inference.py rename to src/orcapod/types/inference.py diff --git a/src/orcabridge/types/registry.py b/src/orcapod/types/registry.py similarity index 99% rename from src/orcabridge/types/registry.py rename to src/orcapod/types/registry.py index 2870b1c..0dafda5 100644 --- a/src/orcabridge/types/registry.py +++ b/src/orcapod/types/registry.py @@ -3,7 +3,7 @@ from optparse import Values from typing import Any import pyarrow as pa -from orcabridge.types import Packet +from orcapod.types import Packet from .core import TypeHandler, TypeInfo, TypeSpec # This mapping is expected to be stable diff --git a/src/orcabridge/types/utils.py b/src/orcapod/types/utils.py similarity index 100% rename from src/orcabridge/types/utils.py rename to src/orcapod/types/utils.py diff --git a/src/orcabridge/utils/__init__.py b/src/orcapod/utils/__init__.py similarity index 100% rename from src/orcabridge/utils/__init__.py rename to src/orcapod/utils/__init__.py diff --git a/src/orcabridge/utils/name.py b/src/orcapod/utils/name.py similarity index 100% rename from src/orcabridge/utils/name.py rename to src/orcapod/utils/name.py diff --git 
a/src/orcabridge/utils/stream_utils.py b/src/orcapod/utils/stream_utils.py similarity index 98% rename from src/orcabridge/utils/stream_utils.py rename to src/orcapod/utils/stream_utils.py index a762b06..51d46c1 100644 --- a/src/orcabridge/utils/stream_utils.py +++ b/src/orcapod/utils/stream_utils.py @@ -5,7 +5,7 @@ from collections.abc import Collection, Mapping from typing import TypeVar, Hashable, Any -from orcabridge.types import Packet, Tag, TypeSpec +from orcapod.types import Packet, Tag, TypeSpec K = TypeVar("K", bound=Hashable) diff --git a/tests/test_hashing/generate_file_hashes.py b/tests/test_hashing/generate_file_hashes.py index a2fe385..1002b7f 100644 --- a/tests/test_hashing/generate_file_hashes.py +++ b/tests/test_hashing/generate_file_hashes.py @@ -16,7 +16,7 @@ # Add the parent directory to the path to import orcabridge sys.path.append(str(Path(__file__).parent.parent.parent)) -from orcabridge.hashing import hash_file +from orcapod.hashing import hash_file # Create directories if they don't exist HASH_SAMPLES_DIR = Path(__file__).parent / "hash_samples" diff --git a/tests/test_hashing/generate_hash_examples.py b/tests/test_hashing/generate_hash_examples.py index cbba97b..3f83ef5 100644 --- a/tests/test_hashing/generate_hash_examples.py +++ b/tests/test_hashing/generate_hash_examples.py @@ -8,7 +8,7 @@ from datetime import datetime from pathlib import Path -from orcabridge.hashing import hash_to_hex, hash_to_int, hash_to_uuid +from orcapod.hashing import hash_to_hex, hash_to_int, hash_to_uuid # Create the hash_samples directory if it doesn't exist SAMPLES_DIR = Path(__file__).parent / "hash_samples" diff --git a/tests/test_hashing/generate_pathset_packet_hashes.py b/tests/test_hashing/generate_pathset_packet_hashes.py index 6314e66..61a36eb 100644 --- a/tests/test_hashing/generate_pathset_packet_hashes.py +++ b/tests/test_hashing/generate_pathset_packet_hashes.py @@ -13,7 +13,7 @@ # Add the parent directory to the path to import orcabridge sys.path.append(str(Path(__file__).parent.parent.parent)) -from orcabridge.hashing import hash_packet, hash_pathset +from orcapod.hashing import hash_packet, hash_pathset # Create directories if they don't exist HASH_SAMPLES_DIR = Path(__file__).parent / "hash_samples" diff --git a/tests/test_hashing/test_basic_composite_hasher.py b/tests/test_hashing/test_basic_composite_hasher.py index fc82402..d2c5361 100644 --- a/tests/test_hashing/test_basic_composite_hasher.py +++ b/tests/test_hashing/test_basic_composite_hasher.py @@ -13,7 +13,7 @@ import pytest -from orcabridge.hashing.file_hashers import PathLikeHasherFactory +from orcapod.hashing.file_hashers import PathLikeHasherFactory def load_hash_lut(): diff --git a/tests/test_hashing/test_basic_hashing.py b/tests/test_hashing/test_basic_hashing.py index 5ab355f..df90a1a 100644 --- a/tests/test_hashing/test_basic_hashing.py +++ b/tests/test_hashing/test_basic_hashing.py @@ -1,4 +1,4 @@ -from orcabridge.hashing.core import ( +from orcapod.hashing.core import ( HashableMixin, hash_to_hex, hash_to_int, diff --git a/tests/test_hashing/test_cached_file_hasher.py b/tests/test_hashing/test_cached_file_hasher.py index e8c3199..3307628 100644 --- a/tests/test_hashing/test_cached_file_hasher.py +++ b/tests/test_hashing/test_cached_file_hasher.py @@ -10,12 +10,12 @@ import pytest -from orcabridge.hashing.file_hashers import ( +from orcapod.hashing.file_hashers import ( BasicFileHasher, CachedFileHasher, ) -from orcabridge.hashing.string_cachers import InMemoryCacher -from orcabridge.hashing.types 
import FileHasher, StringCacher +from orcapod.hashing.string_cachers import InMemoryCacher +from orcapod.hashing.types import FileHasher, StringCacher def verify_path_exists(rel_path): diff --git a/tests/test_hashing/test_composite_hasher.py b/tests/test_hashing/test_composite_hasher.py index d3aa278..1cbe386 100644 --- a/tests/test_hashing/test_composite_hasher.py +++ b/tests/test_hashing/test_composite_hasher.py @@ -1,14 +1,13 @@ #!/usr/bin/env python -# filepath: /home/eywalker/workspace/orcabridge/tests/test_hashing/test_composite_hasher.py """Tests for the CompositeFileHasher implementation.""" from unittest.mock import patch import pytest -from orcabridge.hashing.core import hash_to_hex -from orcabridge.hashing.file_hashers import BasicFileHasher, DefaultCompositeFileHasher -from orcabridge.hashing.types import FileHasher, PacketHasher, PathSetHasher +from orcapod.hashing.core import hash_to_hex +from orcapod.hashing.file_hashers import BasicFileHasher, DefaultCompositeFileHasher +from orcapod.hashing.types import FileHasher, PacketHasher, PathSetHasher # Custom implementation of hash_file for tests that doesn't check for file existence @@ -90,9 +89,9 @@ def mock_hash_packet( def patch_hash_functions(): """Patch the hash functions in the core module for all tests.""" with ( - patch("orcabridge.hashing.core.hash_file", side_effect=mock_hash_file), - patch("orcabridge.hashing.core.hash_pathset", side_effect=mock_hash_pathset), - patch("orcabridge.hashing.core.hash_packet", side_effect=mock_hash_packet), + patch("orcapod.hashing.core.hash_file", side_effect=mock_hash_file), + patch("orcapod.hashing.core.hash_pathset", side_effect=mock_hash_pathset), + patch("orcapod.hashing.core.hash_packet", side_effect=mock_hash_packet), ): yield diff --git a/tests/test_hashing/test_file_hashes.py b/tests/test_hashing/test_file_hashes.py index 70ff814..66ed987 100644 --- a/tests/test_hashing/test_file_hashes.py +++ b/tests/test_hashing/test_file_hashes.py @@ -13,7 +13,7 @@ import pytest # Add the parent directory to the path to import orcabridge -from orcabridge.hashing import hash_file +from orcapod.hashing import hash_file def load_hash_lut(): diff --git a/tests/test_hashing/test_hash_samples.py b/tests/test_hashing/test_hash_samples.py index 54fa32f..cfb3e35 100644 --- a/tests/test_hashing/test_hash_samples.py +++ b/tests/test_hashing/test_hash_samples.py @@ -12,7 +12,7 @@ import pytest -from orcabridge.hashing import hash_to_hex, hash_to_int, hash_to_uuid +from orcapod.hashing import hash_to_hex, hash_to_int, hash_to_uuid def get_latest_hash_samples(): diff --git a/tests/test_hashing/test_hasher_factory.py b/tests/test_hashing/test_hasher_factory.py index 6e80827..afd2392 100644 --- a/tests/test_hashing/test_hasher_factory.py +++ b/tests/test_hashing/test_hasher_factory.py @@ -4,12 +4,12 @@ import tempfile from pathlib import Path -from orcabridge.hashing.file_hashers import ( +from orcapod.hashing.file_hashers import ( BasicFileHasher, CachedFileHasher, PathLikeHasherFactory, ) -from orcabridge.hashing.string_cachers import FileCacher, InMemoryCacher +from orcapod.hashing.string_cachers import FileCacher, InMemoryCacher class TestPathLikeHasherFactoryCreateFileHasher: diff --git a/tests/test_hashing/test_hasher_parity.py b/tests/test_hashing/test_hasher_parity.py index 36d0a65..fb83afb 100644 --- a/tests/test_hashing/test_hasher_parity.py +++ b/tests/test_hashing/test_hasher_parity.py @@ -14,8 +14,8 @@ import pytest -from orcabridge.hashing.core import hash_file, hash_packet, hash_pathset -from 
orcabridge.hashing.file_hashers import PathLikeHasherFactory +from orcapod.hashing.core import hash_file, hash_packet, hash_pathset +from orcapod.hashing.file_hashers import PathLikeHasherFactory def load_hash_lut(): diff --git a/tests/test_hashing/test_packet_hasher.py b/tests/test_hashing/test_packet_hasher.py index 001f693..f9d519d 100644 --- a/tests/test_hashing/test_packet_hasher.py +++ b/tests/test_hashing/test_packet_hasher.py @@ -4,8 +4,8 @@ import pytest -from orcabridge.hashing.file_hashers import DefaultPacketHasher -from orcabridge.hashing.types import PathSetHasher +from orcapod.hashing.file_hashers import DefaultPacketHasher +from orcapod.hashing.types import PathSetHasher class MockPathSetHasher(PathSetHasher): diff --git a/tests/test_hashing/test_path_set_hasher.py b/tests/test_hashing/test_path_set_hasher.py index ed75b3d..999cc2a 100644 --- a/tests/test_hashing/test_path_set_hasher.py +++ b/tests/test_hashing/test_path_set_hasher.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# filepath: /home/eywalker/workspace/orcabridge/tests/test_hashing/test_path_set_hasher.py """Tests for the PathSetHasher protocol implementation.""" import os @@ -9,9 +8,9 @@ import pytest -import orcabridge.hashing.core -from orcabridge.hashing.file_hashers import DefaultPathsetHasher -from orcabridge.hashing.types import FileHasher +import orcapod.hashing.core +from orcapod.hashing.file_hashers import DefaultPathsetHasher +from orcapod.hashing.types import FileHasher class MockFileHasher(FileHasher): @@ -36,7 +35,7 @@ def create_temp_file(content="test content"): # Store original function for restoration -original_hash_pathset = orcabridge.hashing.core.hash_pathset +original_hash_pathset = orcapod.hashing.core.hash_pathset # Custom implementation of hash_pathset for tests that doesn't check for file existence @@ -47,8 +46,8 @@ def mock_hash_pathset( from collections.abc import Collection from os import PathLike - from orcabridge.hashing.core import hash_to_hex - from orcabridge.utils.name import find_noncolliding_name + from orcapod.hashing.core import hash_to_hex + from orcapod.utils.name import find_noncolliding_name # If file_hasher is None, we'll need to handle it differently if file_hasher is None: @@ -87,7 +86,7 @@ def mock_hash_pathset( @pytest.fixture(autouse=True) def patch_hash_pathset(): """Patch the hash_pathset function in the hashing module for all tests.""" - with patch("orcabridge.hashing.core.hash_pathset", side_effect=mock_hash_pathset): + with patch("orcapod.hashing.core.hash_pathset", side_effect=mock_hash_pathset): yield @@ -226,7 +225,7 @@ def custom_hash_nonexistent(pathset, **kwargs): # Patch hash_pathset just for this test with patch( - "orcabridge.hashing.core.hash_pathset", side_effect=custom_hash_nonexistent + "orcapod.hashing.core.hash_pathset", side_effect=custom_hash_nonexistent ): result = pathset_hasher.hash_pathset(pathset) diff --git a/tests/test_hashing/test_pathset_and_packet.py b/tests/test_hashing/test_pathset_and_packet.py index 91efbc7..6b7eb6f 100644 --- a/tests/test_hashing/test_pathset_and_packet.py +++ b/tests/test_hashing/test_pathset_and_packet.py @@ -14,7 +14,7 @@ import pytest -from orcabridge.hashing import hash_file, hash_packet, hash_pathset +from orcapod.hashing import hash_file, hash_packet, hash_pathset logger = logging.getLogger(__name__) diff --git a/tests/test_hashing/test_pathset_packet_hashes.py b/tests/test_hashing/test_pathset_packet_hashes.py index 9f31f00..49e2d0c 100644 --- a/tests/test_hashing/test_pathset_packet_hashes.py +++ 
b/tests/test_hashing/test_pathset_packet_hashes.py @@ -13,7 +13,7 @@ import pytest # Add the parent directory to the path to import orcabridge -from orcabridge.hashing import hash_packet, hash_pathset +from orcapod.hashing import hash_packet, hash_pathset def load_pathset_hash_lut(): diff --git a/tests/test_hashing/test_process_structure.py b/tests/test_hashing/test_process_structure.py index 24b3b08..933e2dc 100644 --- a/tests/test_hashing/test_process_structure.py +++ b/tests/test_hashing/test_process_structure.py @@ -3,7 +3,7 @@ from pathlib import Path from typing import Any -from orcabridge.hashing.core import HashableMixin, hash_to_hex, process_structure +from orcapod.hashing.core import HashableMixin, hash_to_hex, process_structure # Define a simple HashableMixin class for testing diff --git a/tests/test_hashing/test_sqlite_cacher.py b/tests/test_hashing/test_sqlite_cacher.py index 99a8030..6018b30 100644 --- a/tests/test_hashing/test_sqlite_cacher.py +++ b/tests/test_hashing/test_sqlite_cacher.py @@ -7,7 +7,7 @@ from pathlib import Path from unittest.mock import MagicMock, patch -from orcabridge.hashing.string_cachers import SQLiteCacher +from orcapod.hashing.string_cachers import SQLiteCacher def test_basic_operations(): diff --git a/tests/test_hashing/test_string_cacher/test_file_cacher.py b/tests/test_hashing/test_string_cacher/test_file_cacher.py index 223fcf8..e6104d3 100644 --- a/tests/test_hashing/test_string_cacher/test_file_cacher.py +++ b/tests/test_hashing/test_string_cacher/test_file_cacher.py @@ -6,7 +6,7 @@ from pathlib import Path from unittest.mock import mock_open, patch -from orcabridge.hashing.string_cachers import FileCacher +from orcapod.hashing.string_cachers import FileCacher def test_basic_operations(): diff --git a/tests/test_hashing/test_string_cacher/test_in_memory_cacher.py b/tests/test_hashing/test_string_cacher/test_in_memory_cacher.py index 5e76e44..ce2a2d4 100644 --- a/tests/test_hashing/test_string_cacher/test_in_memory_cacher.py +++ b/tests/test_hashing/test_string_cacher/test_in_memory_cacher.py @@ -3,7 +3,7 @@ import threading import time -from orcabridge.hashing.string_cachers import InMemoryCacher +from orcapod.hashing.string_cachers import InMemoryCacher def test_basic_operations(): diff --git a/tests/test_hashing/test_string_cacher/test_redis_cacher.py b/tests/test_hashing/test_string_cacher/test_redis_cacher.py index 060fb61..3ef49e1 100644 --- a/tests/test_hashing/test_string_cacher/test_redis_cacher.py +++ b/tests/test_hashing/test_string_cacher/test_redis_cacher.py @@ -5,7 +5,7 @@ import pytest -from orcabridge.hashing.string_cachers import RedisCacher +from orcapod.hashing.string_cachers import RedisCacher if TYPE_CHECKING: import redis @@ -86,7 +86,7 @@ def mock_no_redis(): class TestRedisCacher: """Test cases for RedisCacher with mocked Redis.""" - @patch("orcabridge.hashing.string_cachers._get_redis", mock_get_redis) + @patch("orcapod.hashing.string_cachers._get_redis", mock_get_redis) def test_basic_operations(self): """Test basic get/set/clear operations.""" mock_redis = MockRedis() @@ -113,7 +113,7 @@ def test_basic_operations(self): assert cacher.get_cached("key1") is None assert cacher.get_cached("key2") is None - @patch("orcabridge.hashing.string_cachers._get_redis", mock_get_redis) + @patch("orcapod.hashing.string_cachers._get_redis", mock_get_redis) def test_key_prefixing(self): """Test that keys are properly prefixed.""" mock_redis = MockRedis() @@ -128,7 +128,7 @@ def test_key_prefixing(self): # But retrieval should work 
without prefix assert cacher.get_cached("key1") == "value1" - @patch("orcabridge.hashing.string_cachers._get_redis", mock_get_redis) + @patch("orcapod.hashing.string_cachers._get_redis", mock_get_redis) def test_connection_initialization_success(self): """Test successful connection initialization.""" mock_redis = MockRedis() @@ -143,7 +143,7 @@ def test_connection_initialization_success(self): assert mock_redis.ping_called assert cacher.is_connected() - @patch("orcabridge.hashing.string_cachers._get_redis", mock_get_redis) + @patch("orcapod.hashing.string_cachers._get_redis", mock_get_redis) def test_connection_initialization_failure(self): """Test connection initialization failure.""" mock_redis = MockRedis(fail_connection=True) @@ -151,7 +151,7 @@ def test_connection_initialization_failure(self): with pytest.raises(RuntimeError, match="Redis connection test failed"): RedisCacher(connection=mock_redis, key_prefix="test:") - @patch("orcabridge.hashing.string_cachers._get_redis", mock_get_redis) + @patch("orcapod.hashing.string_cachers._get_redis", mock_get_redis) def test_new_connection_creation(self): """Test creation of new Redis connection when none provided.""" cacher = RedisCacher(host="localhost", port=6379, db=0, key_prefix="test:") @@ -171,7 +171,7 @@ def test_new_connection_creation(self): assert cacher.is_connected() - @patch("orcabridge.hashing.string_cachers._get_redis", mock_get_redis) + @patch("orcapod.hashing.string_cachers._get_redis", mock_get_redis) def test_graceful_failure_on_operations(self): """Test graceful failure when Redis operations fail during use.""" mock_redis = MockRedis() @@ -193,7 +193,7 @@ def test_graceful_failure_on_operations(self): mock_log.assert_called_once() assert "Redis get failed" in str(mock_log.call_args) - @patch("orcabridge.hashing.string_cachers._get_redis", mock_get_redis) + @patch("orcapod.hashing.string_cachers._get_redis", mock_get_redis) def test_set_failure_handling(self): """Test handling of set operation failures.""" mock_redis = MockRedis() @@ -208,7 +208,7 @@ def test_set_failure_handling(self): assert "Redis set failed" in str(mock_log.call_args) assert not cacher.is_connected() - @patch("orcabridge.hashing.string_cachers._get_redis", mock_get_redis) + @patch("orcapod.hashing.string_cachers._get_redis", mock_get_redis) def test_clear_cache_failure_handling(self): """Test handling of clear cache operation failures.""" mock_redis = MockRedis() @@ -226,7 +226,7 @@ def test_clear_cache_failure_handling(self): assert "Redis clear failed" in str(mock_log.call_args) assert not cacher.is_connected() - @patch("orcabridge.hashing.string_cachers._get_redis", mock_get_redis) + @patch("orcapod.hashing.string_cachers._get_redis", mock_get_redis) def test_clear_cache_with_pattern_matching(self): """Test that clear_cache only removes keys with the correct prefix.""" mock_redis = MockRedis() @@ -244,7 +244,7 @@ def test_clear_cache_with_pattern_matching(self): assert "test:key2" not in mock_redis.data assert "other:key1" in mock_redis.data # Should remain - @patch("orcabridge.hashing.string_cachers._get_redis", mock_get_redis) + @patch("orcapod.hashing.string_cachers._get_redis", mock_get_redis) def test_connection_reset(self): """Test connection reset functionality.""" mock_redis = MockRedis() @@ -265,7 +265,7 @@ def test_connection_reset(self): # Check that the reset message was logged (it should be the last call) mock_log.assert_called_with("Redis connection successfully reset") - @patch("orcabridge.hashing.string_cachers._get_redis", 
mock_get_redis) + @patch("orcapod.hashing.string_cachers._get_redis", mock_get_redis) def test_connection_reset_failure(self): """Test connection reset failure handling.""" mock_redis = MockRedis() @@ -287,7 +287,7 @@ def test_connection_reset_failure(self): "Failed to reset Redis connection: Redis connection test failed: Connection failed" ) - @patch("orcabridge.hashing.string_cachers._get_redis", mock_get_redis) + @patch("orcapod.hashing.string_cachers._get_redis", mock_get_redis) def test_error_logging_only_once(self): """Test that errors are only logged once per failure.""" mock_redis = MockRedis() @@ -305,7 +305,7 @@ def test_error_logging_only_once(self): # Should only log the first error assert mock_log.call_count == 1 - @patch("orcabridge.hashing.string_cachers._get_redis", mock_get_redis) + @patch("orcapod.hashing.string_cachers._get_redis", mock_get_redis) def test_default_key_prefix(self): """Test default key prefix behavior.""" mock_redis = MockRedis() @@ -320,11 +320,11 @@ def test_default_key_prefix(self): def test_redis_not_available(self): """Test behavior when redis package is not available.""" - with patch("orcabridge.hashing.string_cachers._get_redis", mock_no_redis): + with patch("orcapod.hashing.string_cachers._get_redis", mock_no_redis): with pytest.raises(ImportError, match="redis package is required"): RedisCacher() - @patch("orcabridge.hashing.string_cachers._get_redis", mock_get_redis) + @patch("orcapod.hashing.string_cachers._get_redis", mock_get_redis) def test_connection_test_key_access_failure(self): """Test failure when connection test can't create/access test key.""" @@ -340,7 +340,7 @@ def get(self, key): with pytest.raises(RuntimeError, match="Redis connection test failed"): RedisCacher(connection=mock_redis, key_prefix="test:") - @patch("orcabridge.hashing.string_cachers._get_redis", mock_get_redis) + @patch("orcapod.hashing.string_cachers._get_redis", mock_get_redis) def test_thread_safety(self): """Test thread safety of Redis operations.""" import threading @@ -398,7 +398,7 @@ def worker(thread_id: int): expected = f"thread{thread_id}_value{i}" assert result == expected - @patch("orcabridge.hashing.string_cachers._get_redis", mock_get_redis) + @patch("orcapod.hashing.string_cachers._get_redis", mock_get_redis) def test_operations_after_connection_failure(self): """Test that operations return None/do nothing after connection failure.""" mock_redis = MockRedis() diff --git a/tests/test_hashing/test_string_cacher/test_sqlite_cacher.py b/tests/test_hashing/test_string_cacher/test_sqlite_cacher.py index 9204543..f51069b 100644 --- a/tests/test_hashing/test_string_cacher/test_sqlite_cacher.py +++ b/tests/test_hashing/test_string_cacher/test_sqlite_cacher.py @@ -7,7 +7,7 @@ from pathlib import Path from unittest.mock import MagicMock, patch -from orcabridge.hashing.string_cachers import SQLiteCacher +from orcapod.hashing.string_cachers import SQLiteCacher def test_basic_operations(): diff --git a/tests/test_store/test_dir_data_store.py b/tests/test_store/test_dir_data_store.py index 8856436..c07f141 100644 --- a/tests/test_store/test_dir_data_store.py +++ b/tests/test_store/test_dir_data_store.py @@ -8,13 +8,13 @@ import pytest -from orcabridge.hashing.types import ( +from orcapod.hashing.types import ( CompositeFileHasher, FileHasher, PacketHasher, PathSetHasher, ) -from orcabridge.store.core import DirDataStore +from orcapod.store.core import DirDataStore class MockFileHasher(FileHasher): @@ -500,7 +500,7 @@ def 
test_dir_data_store_legacy_mode_compatibility(temp_dir, sample_files): output_packet = {"output_file": sample_files["output"]["output1"]} # Get the hash values directly for comparison - from orcabridge.hashing import hash_packet + from orcapod.hashing import hash_packet legacy_hash = hash_packet(packet, algorithm="sha256") assert store_default.packet_hasher is not None, ( @@ -611,8 +611,8 @@ def test_dir_data_store_hash_equivalence(temp_dir, sample_files): output_packet = {"output_file": sample_files["output"]["output1"]} # First compute hashes directly - from orcabridge.hashing import hash_packet - from orcabridge.hashing.defaults import get_default_composite_file_hasher + from orcapod.hashing import hash_packet + from orcapod.hashing.defaults import get_default_composite_file_hasher legacy_hash = hash_packet(packet, algorithm="sha256") default_hasher = get_default_composite_file_hasher( diff --git a/tests/test_store/test_integration.py b/tests/test_store/test_integration.py index 8314362..023e6e6 100644 --- a/tests/test_store/test_integration.py +++ b/tests/test_store/test_integration.py @@ -7,13 +7,13 @@ import pytest -from orcabridge.hashing.file_hashers import ( +from orcapod.hashing.file_hashers import ( BasicFileHasher, CachedFileHasher, DefaultCompositeFileHasher, ) -from orcabridge.hashing.string_cachers import InMemoryCacher -from orcabridge.store.core import DirDataStore, NoOpDataStore +from orcapod.hashing.string_cachers import InMemoryCacher +from orcapod.store.core import DirDataStore, NoOpDataStore def test_integration_with_cached_file_hasher(temp_dir, sample_files): diff --git a/tests/test_store/test_noop_data_store.py b/tests/test_store/test_noop_data_store.py index 8f160d1..0da82c7 100644 --- a/tests/test_store/test_noop_data_store.py +++ b/tests/test_store/test_noop_data_store.py @@ -4,7 +4,7 @@ import pytest -from orcabridge.store.core import NoOpDataStore +from orcapod.store.core import NoOpDataStore def test_noop_data_store_memoize(): @@ -44,7 +44,7 @@ def test_noop_data_store_retrieve_memoized(): def test_noop_data_store_is_data_store_subclass(): """Test that NoOpDataStore is a subclass of DataStore.""" - from orcabridge.store.core import DataStore + from orcapod.store.core import DataStore store = NoOpDataStore() assert isinstance(store, DataStore) diff --git a/tests/test_store/test_transfer_data_store.py b/tests/test_store/test_transfer_data_store.py index ddb1d09..85d0a87 100644 --- a/tests/test_store/test_transfer_data_store.py +++ b/tests/test_store/test_transfer_data_store.py @@ -7,9 +7,9 @@ import pytest -from orcabridge.hashing.types import PacketHasher -from orcabridge.store.core import DirDataStore, NoOpDataStore -from orcabridge.store.transfer import TransferDataStore +from orcapod.hashing.types import PacketHasher +from orcapod.store.core import DirDataStore, NoOpDataStore +from orcapod.store.transfer import TransferDataStore class MockPacketHasher(PacketHasher): diff --git a/tests/test_types/test_inference/test_extract_function_data_types.py b/tests/test_types/test_inference/test_extract_function_data_types.py index c3426b6..a357bb0 100644 --- a/tests/test_types/test_inference/test_extract_function_data_types.py +++ b/tests/test_types/test_inference/test_extract_function_data_types.py @@ -11,7 +11,7 @@ import pytest from collections.abc import Collection -from orcabridge.types.inference import extract_function_data_types +from orcapod.types.inference import extract_function_data_types class TestExtractFunctionDataTypes: From 
035c9e2b2199a982b734ef9a7017f3cb3624fa74 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Thu, 19 Jun 2025 00:34:13 +0000 Subject: [PATCH 2/4] build: update pyproject and readme --- README.md | 4 ++-- pyproject.toml | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index df97786..9641750 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ -# orcabridge -Prototype of Orcapod as implemented in Python with functions +# Orcapod Python +Orcapod's Python library for developing reproducible scientific pipelines. ## Continuous Integration diff --git a/pyproject.toml b/pyproject.toml index 0dfedeb..ca1c20c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,8 +3,8 @@ requires = ["setuptools>=64", "wheel", "setuptools-scm>=8"] build-backend = "setuptools.build_meta" [project] -name = "orcabridge" -description = "Function-based Oracapod Pipeline implementation in Python" +name = "orcapod" +description = "A simple yet powerful pipeline library for building reproducible scientific pipelines" dynamic = ["version"] dependencies = [ "xxhash", @@ -27,7 +27,7 @@ classifiers = [ ] [project.urls] -Homepage = "https://github.com/walkerlab/orcabridge" +Homepage = "https://github.com/walkerlab/orcapod-python" [project.optional-dependencies] redis = ["redis>=6.2.0"] @@ -37,7 +37,7 @@ redis = ["redis>=6.2.0"] where = ["src"] [tool.setuptools_scm] -version_file = "src/orcabridge/_version.py" +version_file = "src/orcapod/_version.py" [dependency-groups] dev = [ From d44eb93a59a7ec1ec7094070788c3db8cc9c1638 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Thu, 19 Jun 2025 00:35:55 +0000 Subject: [PATCH 3/4] doc: rename notebooks --- ...nb => 01_orcapod_core_concepts copy.ipynb} | 2 +- notebooks/02_orcapod_basic_usage copy.ipynb | 2856 +++++++++++++++++ ...age.ipynb => 02_orcapod_basic_usage.ipynb} | 0 ....ipynb => 03_orcacapod_qol_features.ipynb} | 0 ...tracker.ipynb => 04_orcapod_tracker.ipynb} | 0 5 files changed, 2857 insertions(+), 1 deletion(-) rename notebooks/{01_orcabridge_core_concepts copy.ipynb => 01_orcapod_core_concepts copy.ipynb} (94%) create mode 100644 notebooks/02_orcapod_basic_usage copy.ipynb rename notebooks/{02_orcabridge_basic_usage.ipynb => 02_orcapod_basic_usage.ipynb} (100%) rename notebooks/{03_orcabridge_qol_features.ipynb => 03_orcacapod_qol_features.ipynb} (100%) rename notebooks/{04_orcabridge_tracker.ipynb => 04_orcapod_tracker.ipynb} (100%) diff --git a/notebooks/01_orcabridge_core_concepts copy.ipynb b/notebooks/01_orcapod_core_concepts copy.ipynb similarity index 94% rename from notebooks/01_orcabridge_core_concepts copy.ipynb rename to notebooks/01_orcapod_core_concepts copy.ipynb index 590b977..d239370 100644 --- a/notebooks/01_orcabridge_core_concepts copy.ipynb +++ b/notebooks/01_orcapod_core_concepts copy.ipynb @@ -20,7 +20,7 @@ "source": [ "Below I define few critical concepts for Orcapod.\n", "\n", - "* `Data` -- In Orcapod, smallest unit of `data` is a single `file`. Unlike many other computation pipeline system, Orcapod pipeline in principle does **not** operate on `data` that's not a file. In other words, `Oracpod` pipeline will **not** pass a data in memory from one node to another. Consequently, all operations and processing in Orcapod pipeline revolves around `file` (NOTE: this is a particularly strong/restrictive version of Oracpod pipeline.
We may consider extending data to things like environment variable and command line arguments)\n", + "* `Data` -- In Orcapod, the smallest unit of `data` is a single `file`. Unlike many other computation pipeline systems, an Orcapod pipeline in principle does **not** operate on `data` that's not a file. In other words, the `Orcapod` pipeline will **not** pass data in memory from one node to another. Consequently, all operations and processing in an Orcapod pipeline revolve around `file` (NOTE: this is a particularly strong/restrictive version of the Orcapod pipeline. We may consider extending data to things like environment variables and command line arguments)\n", "* `Pathset` -- a unit of data that can be passed into a pod. A `pathset` consists of a file, a directory, or a collection of one or more file and directories.\n", "* `Packet` -- a single concrete instance of key-value pair, mapping packet key to a single `pathset`.\n", "* `Stream` -- a series of one or more `packets` flowing from a `data producer` to a `data consumer`. In a directed acyclic graph represneing an Orcapod `pipeline`, a `stream` corresponds to a *directed* edge connecting from a data source into a `data consumer` (e.g., `pod`)\n", diff --git a/notebooks/02_orcapod_basic_usage copy.ipynb b/notebooks/02_orcapod_basic_usage copy.ipynb new file mode 100644 index 0000000..ac98431 --- /dev/null +++ b/notebooks/02_orcapod_basic_usage copy.ipynb @@ -0,0 +1,2856 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Using Orcabridge" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this notebook, we will explore the basic usage of the Orcabridge library." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Below we explore the usage of the `orcabridge` package, enumerating the core components. Many of these will correspond directly to [core concepts](./01_orcabridge_core_concepts%20copy.ipynb) introduced in [part 1](./01_orcabridge_core_concepts%20copy.ipynb)."
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload\n", + "# import orcabridge package\n", + "import orcabridge as ob\n", + "import polars as pl\n", + "import pyarrow as pa" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from orcabridge.pod import TypedFunctionPod\n", + "from orcabridge.hashing.semantic_arrow_hasher import SemanticArrowHasher\n", + "from orcabridge.pod.core import CachedFunctionPod\n", + "from orcabridge.hashing.object_hashers import LegacyObjectHasher\n", + "from orcabridge.hashing.function_info_extractors import FunctionSignatureExtractor\n", + "from orcabridge.core.streams import SyncStreamFromLists\n", + "from orcabridge.types.registry import *" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from orcabridge.store.arrow_data_stores import (\n", + " ParquetArrowDataStore,\n", + " demo_single_row_constraint,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:orcabridge.store.arrow_data_stores:Loading metadata index...\n", + "INFO:orcabridge.store.arrow_data_stores:Loaded metadata for 0 records\n", + "INFO:orcabridge.store.arrow_data_stores:Initialized lazy ParquetArrowDataStore at /tmp/tmprez6xpqd\n", + "WARNING:orcabridge.store.arrow_data_stores:User data contains 'entry_id' column. Consider removing it since '__entry_id' is tracked automatically.\n", + "INFO:orcabridge.store.arrow_data_stores:Added record experiments:dataset_A:entry_001_abcdef1234567890abcdef1234567890 with 1 rows\n", + "ERROR:orcabridge.store.arrow_data_stores:Schema mismatch for experiments/dataset_A:\n", + "ERROR:orcabridge.store.arrow_data_stores: Existing user columns: ['_user_entry_id', 'timestamp', 'value', 'category']\n", + "ERROR:orcabridge.store.arrow_data_stores: New user columns: ['entry_id', 'timestamp', 'value', 'category']\n", + "ERROR:orcabridge.store.arrow_data_stores: Missing in new: {'_user_entry_id'}\n", + "ERROR:orcabridge.store.arrow_data_stores: Extra in new: {'entry_id'}\n", + "INFO:orcabridge.store.arrow_data_stores:Shutting down ParquetArrowDataStore...\n", + "INFO:orcabridge.store.arrow_data_stores:Synced 1 dirty caches to disk\n", + "INFO:orcabridge.store.arrow_data_stores:Shutdown complete\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Testing Single-Row Constraint...\n", + "\n", + "=== Testing Valid Single-Row Records ===\n", + "✓ Added single-row record entry_001_abcdef... (value: 100.0)\n" + ] + }, + { + "ename": "ValueError", + "evalue": "Schema mismatch for experiments/dataset_A. Existing data has columns ['_user_entry_id', 'timestamp', 'value', 'category'], but new data has columns ['entry_id', 'timestamp', 'value', 'category']. 
All records in a source must have the same schema.", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mValueError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[4]\u001b[39m\u001b[32m, line 4\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mlogging\u001b[39;00m\n\u001b[32m 3\u001b[39m logging.basicConfig(level=logging.INFO)\n\u001b[32m----> \u001b[39m\u001b[32m4\u001b[39m \u001b[43mdemo_single_row_constraint\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcabridge/src/orcabridge/store/arrow_data_stores.py:959\u001b[39m, in \u001b[36mdemo_single_row_constraint\u001b[39m\u001b[34m()\u001b[39m\n\u001b[32m 957\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m i, entry_id \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(valid_entries):\n\u001b[32m 958\u001b[39m data = create_single_row_record(entry_id, value=\u001b[32m100.0\u001b[39m + i)\n\u001b[32m--> \u001b[39m\u001b[32m959\u001b[39m result = \u001b[43mstore\u001b[49m\u001b[43m.\u001b[49m\u001b[43madd_record\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mexperiments\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mdataset_A\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mentry_id\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdata\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 960\u001b[39m \u001b[38;5;28mprint\u001b[39m(\n\u001b[32m 961\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33m✓ Added single-row record \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mentry_id[:\u001b[32m16\u001b[39m]\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m... 
(value: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[32m100.0\u001b[39m\u001b[38;5;250m \u001b[39m+\u001b[38;5;250m \u001b[39mi\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m)\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 962\u001b[39m )\n\u001b[32m 964\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[33mTotal records stored: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(store._record_metadata)\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcabridge/src/orcabridge/store/arrow_data_stores.py:690\u001b[39m, in \u001b[36mParquetArrowDataStore.add_record\u001b[39m\u001b[34m(self, source_name, source_id, entry_id, arrow_data)\u001b[39m\n\u001b[32m 687\u001b[39m logger.error(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33m Missing in new: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mset\u001b[39m(existing_cols)\u001b[38;5;250m \u001b[39m-\u001b[38;5;250m \u001b[39m\u001b[38;5;28mset\u001b[39m(new_cols)\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m 688\u001b[39m logger.error(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33m Extra in new: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mset\u001b[39m(new_cols)\u001b[38;5;250m \u001b[39m-\u001b[38;5;250m \u001b[39m\u001b[38;5;28mset\u001b[39m(existing_cols)\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m--> \u001b[39m\u001b[32m690\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[32m 691\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mSchema mismatch for \u001b[39m\u001b[38;5;132;01m{\u001b[39;00msource_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00msource_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m. \u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 692\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mExisting data has columns \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mexisting_cols\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m, \u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 693\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mbut new data has columns \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mnew_cols\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m. \u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 694\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mAll records in a source must have the same schema.\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 695\u001b[39m )\n\u001b[32m 697\u001b[39m now = datetime.now()\n\u001b[32m 698\u001b[39m record_key = \u001b[38;5;28mself\u001b[39m._get_record_key(source_name, source_id, entry_id)\n", + "\u001b[31mValueError\u001b[39m: Schema mismatch for experiments/dataset_A. Existing data has columns ['_user_entry_id', 'timestamp', 'value', 'category'], but new data has columns ['entry_id', 'timestamp', 'value', 'category']. All records in a source must have the same schema." 
+ ] + } + ], + "source": [ + "import logging\n", + "\n", + "logging.basicConfig(level=logging.INFO)\n", + "demo_single_row_constraint()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:orcabridge.store.arrow_data_stores:Loading metadata index...\n", + "INFO:orcabridge.store.arrow_data_stores:Loaded metadata for 1 records\n", + "INFO:orcabridge.store.arrow_data_stores:Initialized lazy ParquetArrowDataStore at ./data\n" + ] + } + ], + "source": [ + "from datetime import datetime, timedelta\n", + "\n", + "# Initialize store with single-row constraint enforcement\n", + "store = ParquetArrowDataStore(base_path=\"./data\", duplicate_entry_behavior=\"overwrite\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:orcabridge.store.arrow_data_stores:Added record experiments:dataset_A:entry_123d... with 1 rows\n" + ] + } + ], + "source": [ + "# This works - single row\n", + "single_row_data = pa.table({\"value\": [46.0], \"timestamp\": [datetime.now()]})\n", + "store.add_record(\"experiments\", \"dataset_A\", \"entry_1245d...\", single_row_data)\n", + "\n", + "# This fails - multiple rows\n", + "multi_row_data = pa.table({\"value\": [1.0, 2.0, 3.0], \"timestamp\": [datetime.now()] * 3})" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "store.force_sync()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/eywalker/workspace/orcabridge/src/orcabridge/store/arrow_data_stores.py:786: PerformanceWarning: Determining the column names of a LazyFrame requires resolving its schema, which is a potentially expensive operation. Use `LazyFrame.collect_schema().names()` to get the column names without this warning.\n", + " user_columns = [col for col in lazy_frame.columns if col not in system_cols]\n" + ] + } + ], + "source": [ + "data = store.get_all_records_as_polars(\"experiments\", \"dataset_A\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (2, 2)
valuetimestamp
f64datetime[μs]
42.02025-06-16 09:05:29.641098
42.02025-06-16 09:06:46.449592
" + ], + "text/plain": [ + "shape: (2, 2)\n", + "┌───────┬────────────────────────────┐\n", + "│ value ┆ timestamp │\n", + "│ --- ┆ --- │\n", + "│ f64 ┆ datetime[μs] │\n", + "╞═══════╪════════════════════════════╡\n", + "│ 42.0 ┆ 2025-06-16 09:05:29.641098 │\n", + "│ 42.0 ┆ 2025-06-16 09:06:46.449592 │\n", + "└───────┴────────────────────────────┘" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.collect()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (0, 3)
valuetimestampentry_id
f64datetime[μs]cat
" + ], + "text/plain": [ + "shape: (0, 3)\n", + "┌───────┬──────────────┬──────────┐\n", + "│ value ┆ timestamp ┆ entry_id │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ f64 ┆ datetime[μs] ┆ cat │\n", + "╞═══════╪══════════════╪══════════╡\n", + "└───────┴──────────────┴──────────┘" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.collect()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "ename": "ValueError", + "evalue": "Each record must contain exactly 1 row, got 3 rows. This constraint ensures that for each source_name/source_id combination, there is only one valid entry per entry_id.", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mValueError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[9]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[43mstore\u001b[49m\u001b[43m.\u001b[49m\u001b[43madd_record\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mexperiments\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mdataset_A\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mentry_123abc...\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmulti_row_data\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcabridge/src/orcabridge/store/arrow_data_stores.py:541\u001b[39m, in \u001b[36mParquetArrowDataStore.add_record\u001b[39m\u001b[34m(self, source_name, source_id, entry_id, arrow_data)\u001b[39m\n\u001b[32m 539\u001b[39m \u001b[38;5;66;03m# CRITICAL: Enforce single-row constraint\u001b[39;00m\n\u001b[32m 540\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(arrow_data) != \u001b[32m1\u001b[39m:\n\u001b[32m--> \u001b[39m\u001b[32m541\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[32m 542\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mEach record must contain exactly 1 row, got \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(arrow_data)\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m rows. \u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 543\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mThis constraint ensures that for each source_name/source_id combination, \u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 544\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mthere is only one valid entry per entry_id.\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 545\u001b[39m )\n\u001b[32m 547\u001b[39m \u001b[38;5;66;03m# Validate entry_id format (assuming 8+ char identifier)\u001b[39;00m\n\u001b[32m 548\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m entry_id \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(entry_id) < \u001b[32m8\u001b[39m:\n", + "\u001b[31mValueError\u001b[39m: Each record must contain exactly 1 row, got 3 rows. This constraint ensures that for each source_name/source_id combination, there is only one valid entry per entry_id." 
+ ] + } + ], + "source": [ + "store.add_record(\"experiments\", \"dataset_A\", \"entry_123abc...\", multi_row_data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:orcabridge.store.arrow_data_stores:Loading metadata index...\n", + "INFO:orcabridge.store.arrow_data_stores:Loaded metadata for 0 records\n", + "INFO:orcabridge.store.arrow_data_stores:Initialized lazy ParquetArrowDataStore at ./dataset\n" + ] + } + ], + "source": [ + "data_store = ParquetArrowDataStore(\n", + " \"./dataset\", sync_interval_seconds=10, max_loaded_sources=30\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data_store.add_record()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "from orcabridge.types.default import default_registry\n", + "from orcabridge.types.registry import PacketConverter\n", + "\n", + "type_spec = {\"name\": str, \"file\": Path}\n", + "example_packet = {\"name\": \"Edgar\", \"file\": \"sample.txt\"}\n", + "example_packets = [\n", + " {\"name\": \"Edgar\", \"file\": \"sample.txt\"},\n", + " {\"name\": \"Alice\", \"file\": \"sample2.txt\"},\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "converter = PacketConverter(type_spec, registry=default_registry)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "table = converter.to_arrow_table(example_packets)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "pyarrow.Table\n", + "name: string\n", + "file: string\n", + "----\n", + "name: [[\"Edgar\",\"Alice\"]]\n", + "file: [[\"sample.txt\",\"sample2.txt\"]]" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "table" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'2cda22e1527ecf01d1555ed90a0f0a2b40ec7f1034387b5f0f93afb38cf041cf'" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "hasher.hash_table(table)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "processed_table = hasher._process_table_columns(table)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "import inspect" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def test(a: int = 5):\n", + " return 5" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "a a: int = 5\n" + ] + } + ], + "source": [ + "for k, v in inspect.signature(test).parameters.items():\n", + " print(k, v)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'a'" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "v.name" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": 
[ + "pyarrow.Table\n", + "name: string\n", + "file: string\n", + "----\n", + "name: [[\"Edgar\",\"Alice\"]]\n", + "file: [[\"3a1f868f16c70867afdff05d9c7de3a6e573d2ade1ce6a48293d973f8ad68504\",\"3a1f868f16c70867afdff05d9c7de3a6e573d2ade1ce6a48293d973f8ad68504\"]]" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "processed_table" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "z = hasher._sort_table_columns(processed_table)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "pyarrow.Table\n", + "name: string\n", + "file: string\n", + "----\n", + "name: [[\"Edgar\"]]\n", + "file: [[\"f8efdb6bc4c7dc8eb7b439ba9b3d132733f0e73c4aa83748bb13f25133a43633\"]]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "processed_table" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'99ca6c8b436a17d888d65051ba7977eeec93323746e619db5b1c9ab53171566d'" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "hasher.hash_table(table)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'35d6c3043e95f2176c36188849655ac2a412562c6ccfcbec3a7a8a29a3a1eb44'" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "hasher.hash_table(table)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "function_info_extractor = FunctionSignatureExtractor()\n", + "object_hasher = LegacyObjectHasher(function_info_extractor=function_info_extractor)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "def product(x: float, y: int) -> float:\n", + " return x * y" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "class ArrowPacketHasher:\n", + " def hash_packet(self, packet):\n", + " print(f\"Requested to hash packet {packet}\")\n", + " return \"test\"\n", + "\n", + " def hash_arrow_packet(self, packet):\n", + " print(f\"Requested to hash arrow packet {packet}\")\n", + " return \"test_arrow\"\n", + "\n", + "\n", + "class MyArrowDataStore:\n", + " def add_record(\n", + " self,\n", + " source_name: str,\n", + " source_id: str,\n", + " entry_id: str,\n", + " arrow_data: pa.Table,\n", + " ) -> pa.Table:\n", + " print(\n", + " f\"Adding record to Arrow data store: {source_name}, {source_id}, {entry_id}: {arrow_data}\"\n", + " )\n", + " return arrow_data\n", + "\n", + " def get_record(\n", + " self, source_name: str, source_id: str, entry_id: str\n", + " ) -> pa.Table | None:\n", + " return None\n", + "\n", + " def get_all_records(self, source_name: str, source_id: str) -> pa.Table | None:\n", + " \"\"\"Retrieve all records for a given source as a single table.\"\"\"\n", + " return None\n", + "\n", + " def get_all_records_as_polars(\n", + " self, source_name: str, source_id: str\n", + " ) -> pl.LazyFrame | None:\n", + " \"\"\"Retrieve all records for a given source as a single Polars DataFrame.\"\"\"\n", + " return None\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pod = 
TypedFunctionPod(product, \"result\")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "('TypedFunctionPod',\n", + " float>,\n", + " ('result',))" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pod.identity_structure()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "stream = SyncStreamFromLists(\n", + " [\n", + " {\"id\": 0},\n", + " {\"id\": 1},\n", + " {\"id\": 2},\n", + " ],\n", + " [{\"x\": 3.0, \"y\": 4}, {\"x\": 5.0, \"y\": 6}, {\"x\": 7.0, \"y\": 8}],\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tag: {'id': 0}, Packet: {'result': 12.0}\n", + "Tag: {'id': 1}, Packet: {'result': 30.0}\n", + "Tag: {'id': 2}, Packet: {'result': 56.0}\n" + ] + } + ], + "source": [ + "for tag, packet in pod(stream):\n", + " print((f\"Tag: {tag}, Packet: {packet}\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'result': 56.0}]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pod.output_converter.from_arrow_table(pod.output_converter.to_arrow_table(packet))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cached_pod = CachedFunctionPod(\n", + " pod,\n", + " object_hasher=object_hasher,\n", + " arrow_hasher=ArrowPacketHasher(),\n", + " result_store=MyArrowDataStore(),\n", + " tag_store=MyArrowDataStore(),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "12.0" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cached_pod.function_pod.function(3.0, 4)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "12.0" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cached_pod.function_pod.function(3.0, 4)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requested to hash arrow packet pyarrow.Table\n", + "x: double\n", + "y: int64\n", + "----\n", + "x: [[3]]\n", + "y: [[4]]\n", + "Requested to hash arrow packet pyarrow.Table\n", + "id: int64\n", + "__packet_key: string\n", + "----\n", + "id: [[0]]\n", + "__packet_key: [[\"test_arrow\"]]\n", + "Adding record to Arrow data store: product, e9487308f083ecc170e2fe679ce0d30b70e7c9d7c59b532118944e461e40ba1f, test_arrow: pyarrow.Table\n", + "id: int64\n", + "__packet_key: string\n", + "----\n", + "id: [[0]]\n", + "__packet_key: [[\"test_arrow\"]]\n", + "Requested to hash arrow packet pyarrow.Table\n", + "x: double\n", + "y: int64\n", + "----\n", + "x: [[3]]\n", + "y: [[4]]\n", + "Adding record to Arrow data store: product, e9487308f083ecc170e2fe679ce0d30b70e7c9d7c59b532118944e461e40ba1f, test_arrow: pyarrow.Table\n", + "result: double\n", + "----\n", + "result: [[12]]\n", + "Tag: {'id': 0}, Packet: {'result': 12.0}\n", + "Requested to hash arrow packet pyarrow.Table\n", + "x: double\n", + "y: int64\n", + "----\n", + "x: [[5]]\n", 
+ "y: [[6]]\n", + "Requested to hash arrow packet pyarrow.Table\n", + "id: int64\n", + "__packet_key: string\n", + "----\n", + "id: [[1]]\n", + "__packet_key: [[\"test_arrow\"]]\n", + "Adding record to Arrow data store: product, e9487308f083ecc170e2fe679ce0d30b70e7c9d7c59b532118944e461e40ba1f, test_arrow: pyarrow.Table\n", + "id: int64\n", + "__packet_key: string\n", + "----\n", + "id: [[1]]\n", + "__packet_key: [[\"test_arrow\"]]\n", + "Requested to hash arrow packet pyarrow.Table\n", + "x: double\n", + "y: int64\n", + "----\n", + "x: [[5]]\n", + "y: [[6]]\n", + "Adding record to Arrow data store: product, e9487308f083ecc170e2fe679ce0d30b70e7c9d7c59b532118944e461e40ba1f, test_arrow: pyarrow.Table\n", + "result: double\n", + "----\n", + "result: [[30]]\n", + "Tag: {'id': 1}, Packet: {'result': 30.0}\n", + "Requested to hash arrow packet pyarrow.Table\n", + "x: double\n", + "y: int64\n", + "----\n", + "x: [[7]]\n", + "y: [[8]]\n", + "Requested to hash arrow packet pyarrow.Table\n", + "id: int64\n", + "__packet_key: string\n", + "----\n", + "id: [[2]]\n", + "__packet_key: [[\"test_arrow\"]]\n", + "Adding record to Arrow data store: product, e9487308f083ecc170e2fe679ce0d30b70e7c9d7c59b532118944e461e40ba1f, test_arrow: pyarrow.Table\n", + "id: int64\n", + "__packet_key: string\n", + "----\n", + "id: [[2]]\n", + "__packet_key: [[\"test_arrow\"]]\n", + "Requested to hash arrow packet pyarrow.Table\n", + "x: double\n", + "y: int64\n", + "----\n", + "x: [[7]]\n", + "y: [[8]]\n", + "Adding record to Arrow data store: product, e9487308f083ecc170e2fe679ce0d30b70e7c9d7c59b532118944e461e40ba1f, test_arrow: pyarrow.Table\n", + "result: double\n", + "----\n", + "result: [[56]]\n", + "Tag: {'id': 2}, Packet: {'result': 56.0}\n" + ] + } + ], + "source": [ + "for tag, packet in cached_pod(stream):\n", + " print((f\"Tag: {tag}, Packet: {packet}\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "\n", + "\n", + "@typed_function_pod([\"sum\", \"difference\", \"info_path\"])\n", + "def add_and_subtract(a: int, b: int) -> tuple[int, int, Path]:\n", + " \"\"\"\n", + " Adds and subtracts two integers.\n", + "\n", + " :param a: First integer.\n", + " :param b: Second integer.\n", + " :return: A tuple containing the sum and the difference of a and b.\n", + " \"\"\"\n", + " return a + b, a - b, Path(\"local_info.json\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "stream = SyncStreamFromLists(\n", + " [{\"name\": \"Edgar\"}, {\"name\": \"Alice\"}, {\"name\": \"Bob\"}],\n", + " [{\"a\": 5, \"b\": 3}, {\"a\": 10, \"b\": 2}, {\"a\": 7, \"b\": 4}],\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "converter = PacketConverter(\n", + " add_and_subtract.function_output_types, add_and_subtract.registry\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "packets = [p for t, p in add_and_subtract(stream).flow()]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "table = converter.to_arrow_table(packets)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "pyarrow.Table\n", + "sum: 
int64\n", + "difference: int64\n", + "info_path: string\n", + "----\n", + "sum: [[8,12,11]]\n", + "difference: [[2,8,3]]\n", + "info_path: [[\"local_info.json\",\"local_info.json\",\"local_info.json\"]]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "table" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "import pyarrow as pa" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "tag = {\"name\": [\"Edgar\", \"Names\"], \"age\": 37}" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "tag[\"__packet_key\"] = \"some_unique_key\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'51d2a5483d1623ba4582fb372a73bb7726caf1a9756af32778cbac1b8c16f6c1'" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from orcabridge.hashing.defaults import LegacyObjectHasher\n", + "\n", + "LegacyObjectHasher().hash_to_hex(tag)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "pyarrow.Table\n", + "name: list\n", + " child 0, item: string\n", + "age: int64\n", + "__packet_key: string\n", + "----\n", + "name: [[[\"Edgar\",\"Names\"]]]\n", + "age: [[37]]\n", + "__packet_key: [[\"some_unique_key\"]]" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pa.Table.from_pylist([tag])" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'sum': 8, 'difference': 2, 'info_path': PosixPath('local_info.json')},\n", + " {'sum': 12, 'difference': 8, 'info_path': PosixPath('local_info.json')},\n", + " {'sum': 11, 'difference': 3, 'info_path': PosixPath('local_info.json')}]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "converter.from_arrow_table(table)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{}\n", + "{}\n", + "{b'semantic_type': b'path'}\n" + ] + } + ], + "source": [ + "for field in table.schema:\n", + " print(field.metadata)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "import pyarrow as pa" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "ename": "AttributeError", + "evalue": "module 'pyarrow' has no attribute 'type'", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mAttributeError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[13]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[43mpa\u001b[49m\u001b[43m.\u001b[49m\u001b[43mtype\u001b[49m(\u001b[38;5;28mint\u001b[39m)\n", + "\u001b[31mAttributeError\u001b[39m: module 'pyarrow' has no attribute 'type'" + ] + } + ], + "source": [ + "pa.type(int)" + ] + }, + { 
+ "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "pyarrow.Table\n", + "sum: int64\n", + "difference: int64\n", + "info_path: string\n", + "----\n", + "sum: [[8]]\n", + "difference: [[2]]\n", + "info_path: [[\"local_info.json\"]]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "table = converter.to_arrow_table(packets[0])\n", + "table" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pyarrow as pa\n", + "import time\n", + "\n", + "\n", + "def benchmark_conversions():\n", + " # Create test tables\n", + " single_row = pa.table({\"x\": [42], \"y\": [\"hello\"], \"z\": [3.14]})\n", + " multi_row = pa.table(\n", + " {\"x\": list(range(1000)), \"y\": [f\"item_{i}\" for i in range(1000)]}\n", + " )\n", + "\n", + " # Method 1: to_pydict() + post-processing\n", + " def method1(table):\n", + " pydict = table.to_pydict()\n", + " if len(table) == 1:\n", + " return {key: values[0] for key, values in pydict.items()}\n", + " return pydict\n", + "\n", + " # Method 2: Direct scalar extraction\n", + " def method2(table):\n", + " if len(table) == 1:\n", + " return {\n", + " col_name: table.column(col_name)[0].as_py()\n", + " for col_name in table.column_names\n", + " }\n", + " return table.to_pydict()\n", + "\n", + " # Method 3: Using pandas intermediate (generally slower)\n", + " def method3(table):\n", + " df = table.to_pandas()\n", + " if len(df) == 1:\n", + " return df.iloc[0].to_dict()\n", + " return df.to_dict(\"list\")\n", + "\n", + " # Benchmark single row\n", + " print(\"Single row benchmarks:\")\n", + " for i, method in enumerate([method1, method2, method3], 1):\n", + " start = time.time()\n", + " for _ in range(10000):\n", + " result = method(single_row)\n", + " end = time.time()\n", + " print(f\"Method {i}: {end - start:.4f}s\")\n", + "\n", + " print(\"\\nMulti-row benchmarks:\")\n", + " for i, method in enumerate([method1, method2, method3], 1):\n", + " start = time.time()\n", + " for _ in range(1000):\n", + " result = method(multi_row)\n", + " end = time.time()\n", + " print(f\"Method {i}: {end - start:.4f}s\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Single row benchmarks:\n", + "Method 1: 0.3935s\n", + "Method 2: 0.2222s\n", + "Method 3: 4.3977s\n", + "\n", + "Multi-row benchmarks:\n", + "Method 1: 2.4562s\n", + "Method 2: 2.4976s\n", + "Method 3: 1.6264s\n" + ] + } + ], + "source": [ + "benchmark_conversions()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'sum': 8, 'difference': 2, 'info_path': PosixPath('local_info.json')}]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "converter.from_arrow_table(table)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "b\"semantic_type\" in table.schema[2].metadata" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "k" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": 
{ + "text/plain": [ + "{'sum': TypeInfo(python_type=, arrow_type=DataType(int64), semantic_type='int'),\n", + " 'difference': TypeInfo(python_type=, arrow_type=DataType(int64), semantic_type='int'),\n", + " 'info_path': TypeInfo(python_type=, arrow_type=DataType(string), semantic_type='path')}" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "converter.storage_type_info" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'sum': 8, 'difference': 2, 'info_path': PosixPath('local_info.json')},\n", + " {'sum': 12, 'difference': 8, 'info_path': PosixPath('local_info.json')},\n", + " {'sum': 11, 'difference': 3, 'info_path': PosixPath('local_info.json')}]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "packets" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [], + "source": [ + "store_data = to_store(packet)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [], + "source": [ + "arrow_packet = convert_packet_to_arrow_table(\n", + " packet, add_and_subtract.function_output_types, add_and_subtract.registry\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'convert_arrow_tablet_to_packet' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mNameError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[46]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[43mconvert_arrow_tablet_to_packet\u001b[49m\n", + "\u001b[31mNameError\u001b[39m: name 'convert_arrow_tablet_to_packet' is not defined" + ] + } + ], + "source": [ + "convert_arrow_tablet_to_packet" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "sum: int64\n", + "difference: int64\n", + "info_path: string" + ] + }, + "execution_count": 71, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "arrow_packet.schema" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with pa.OSFile(\"arraydata.arrow\", \"wb\") as sink:\n", + " with pa.ipc.new_file(sink, schema=arrow_packet.schema) as writer:\n", + " writer.write(batch)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "ename": "AttributeError", + "evalue": "module 'pyarrow' has no attribute 'save_table'", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mAttributeError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[70]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[43mpa\u001b[49m\u001b[43m.\u001b[49m\u001b[43msave_table\u001b[49m(arrow_packet, \u001b[33m\"\u001b[39m\u001b[33moutput.arrow\u001b[39m\u001b[33m\"\u001b[39m)\n", + "\u001b[31mAttributeError\u001b[39m: module 'pyarrow' has no attribute 'save_table'" + ] + } + ], + "source": [] + }, + { + "cell_type": "code", 
+ "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "pyarrow.Table\n", + "sum: int64\n", + "difference: int64\n", + "info_path: string\n", + "----\n", + "sum: [[8]]\n", + "difference: [[2]]\n", + "info_path: [[\"local_info.json\"]]" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "arrow_packet" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [], + "source": [ + "from deltalake import write_deltalake, DeltaTable\n", + "\n", + "write_deltalake(\"tmp/another-table\", arrow_packet, mode=\"append\")\n", + "\n", + "table = DeltaTable(\"tmp/another-table\")" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (1, 3)
sumdifferenceinfo_path
i64i64str
82"local_info.json"
" + ], + "text/plain": [ + "shape: (1, 3)\n", + "┌─────┬────────────┬─────────────────┐\n", + "│ sum ┆ difference ┆ info_path │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ i64 ┆ i64 ┆ str │\n", + "╞═════╪════════════╪═════════════════╡\n", + "│ 8 ┆ 2 ┆ local_info.json │\n", + "└─────┴────────────┴─────────────────┘" + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pl.DataFrame(table.to_pyarrow_table())" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "ename": "AttributeError", + "evalue": "'deltalake._internal.Schema' object has no attribute 'to_pyarrow'", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mAttributeError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[24]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[43mpl\u001b[49m\u001b[43m.\u001b[49m\u001b[43mread_delta\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mtmp/some-table\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcabridge/.venv/lib/python3.13/site-packages/polars/io/delta.py:149\u001b[39m, in \u001b[36mread_delta\u001b[39m\u001b[34m(source, version, columns, rechunk, storage_options, credential_provider, delta_table_options, use_pyarrow, pyarrow_options)\u001b[39m\n\u001b[32m 26\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mread_delta\u001b[39m(\n\u001b[32m 27\u001b[39m source: \u001b[38;5;28mstr\u001b[39m | DeltaTable,\n\u001b[32m 28\u001b[39m *,\n\u001b[32m (...)\u001b[39m\u001b[32m 36\u001b[39m pyarrow_options: \u001b[38;5;28mdict\u001b[39m[\u001b[38;5;28mstr\u001b[39m, Any] | \u001b[38;5;28;01mNone\u001b[39;00m = \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[32m 37\u001b[39m ) -> DataFrame:\n\u001b[32m 38\u001b[39m \u001b[38;5;250m \u001b[39m\u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 39\u001b[39m \u001b[33;03m Reads into a DataFrame from a Delta lake table.\u001b[39;00m\n\u001b[32m 40\u001b[39m \n\u001b[32m (...)\u001b[39m\u001b[32m 147\u001b[39m \u001b[33;03m ... 
) # doctest: +SKIP\u001b[39;00m\n\u001b[32m 148\u001b[39m \u001b[33;03m \"\"\"\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m149\u001b[39m df = \u001b[43mscan_delta\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 150\u001b[39m \u001b[43m \u001b[49m\u001b[43msource\u001b[49m\u001b[43m=\u001b[49m\u001b[43msource\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 151\u001b[39m \u001b[43m \u001b[49m\u001b[43mversion\u001b[49m\u001b[43m=\u001b[49m\u001b[43mversion\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 152\u001b[39m \u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[43m=\u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 153\u001b[39m \u001b[43m \u001b[49m\u001b[43mcredential_provider\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcredential_provider\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 154\u001b[39m \u001b[43m \u001b[49m\u001b[43mdelta_table_options\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdelta_table_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 155\u001b[39m \u001b[43m \u001b[49m\u001b[43muse_pyarrow\u001b[49m\u001b[43m=\u001b[49m\u001b[43muse_pyarrow\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 156\u001b[39m \u001b[43m \u001b[49m\u001b[43mpyarrow_options\u001b[49m\u001b[43m=\u001b[49m\u001b[43mpyarrow_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 157\u001b[39m \u001b[43m \u001b[49m\u001b[43mrechunk\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrechunk\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 158\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 160\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m columns \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 161\u001b[39m df = df.select(columns)\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcabridge/.venv/lib/python3.13/site-packages/polars/io/delta.py:376\u001b[39m, in \u001b[36mscan_delta\u001b[39m\u001b[34m(***failed resolving arguments***)\u001b[39m\n\u001b[32m 372\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m DeltaProtocolError(msg)\n\u001b[32m 374\u001b[39m \u001b[38;5;66;03m# Requires conversion through pyarrow table because there is no direct way yet to\u001b[39;00m\n\u001b[32m 375\u001b[39m \u001b[38;5;66;03m# convert a delta schema into a polars schema\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m376\u001b[39m delta_schema = \u001b[43mdl_tbl\u001b[49m\u001b[43m.\u001b[49m\u001b[43mschema\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m.\u001b[49m\u001b[43mto_pyarrow\u001b[49m(as_large_types=\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[32m 377\u001b[39m polars_schema = from_arrow(pa.Table.from_pylist([], delta_schema)).schema \u001b[38;5;66;03m# type: ignore[union-attr]\u001b[39;00m\n\u001b[32m 378\u001b[39m partition_columns = dl_tbl.metadata().partition_columns\n", + "\u001b[31mAttributeError\u001b[39m: 'deltalake._internal.Schema' object has no attribute 'to_pyarrow'" + ] + } + ], + "source": [ + "pl." + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (3, 2)
sumdifference
i64i64
82
82
82
" + ], + "text/plain": [ + "shape: (3, 2)\n", + "┌─────┬────────────┐\n", + "│ sum ┆ difference │\n", + "│ --- ┆ --- │\n", + "│ i64 ┆ i64 │\n", + "╞═════╪════════════╡\n", + "│ 8 ┆ 2 │\n", + "│ 8 ┆ 2 │\n", + "│ 8 ┆ 2 │\n", + "└─────┴────────────┘" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pl.DataFrame(table.to_pyarrow_table())" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "pyarrow.Table\n", + "sum: int64\n", + "difference: int64\n", + "----\n", + "sum: [[8],[8],[8]]\n", + "difference: [[2],[2],[2]]" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "table.to_pyarrow_table()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "read_" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "add_and_subtract.registry" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[({'name': 'Edgar'}, {'sum': 8, 'difference': 2}),\n", + " ({'name': 'Alice'}, {'sum': 12, 'difference': 8}),\n", + " ({'name': 'Bob'}, {'sum': 11, 'difference': 3})]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "add_and_subtract(stream).flow()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "FunctionPod:module:__main__ name:_original_add_and_subtract params:(a: int, b: int) returns:tuple[int, int] ⇒ ['sum', 'difference']" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "add_and_subtract" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "add_and_subtract." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Working with streams" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`Stream` is fundamental to Orcapod data pipeline, representing *edges* in a directed acyclic graph (DAG) of an Orcapod pipeline. `Stream` is best thought of as a flowing stream of `packets` -- a unit of data in Oracpod. A `packet` is essentially a ditionary mapping argument names to a `pathset` (that is, one or more files with arbitrary nesting). Ultimately, a pod will receive and work on the `packet`, looking up the pathset that matches the expected argument names defined as the inputs into the pod. Before we explore creating and using `pod`, we will create a very basic `stream` called `GlobStream`, sourcing from a directory. A packet is formed for each file that matches the specified *glob* pattern." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's create a data source out of all `*.txt` files found in the folder `examples/dataset1`" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[0m\u001b[01;32mday1.txt\u001b[0m* \u001b[01;32mday2.txt\u001b[0m* \u001b[01;32mday3.txt\u001b[0m* \u001b[01;32mday4.txt\u001b[0m* \u001b[01;32mday6.txt\u001b[0m*\n" + ] + } + ], + "source": [ + "%ls ../examples/dataset1" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "dataset1 = ob.GlobSource(\"txt_file\", \"../examples/dataset1\", \"*.txt\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can then obtain `stream` from a `source` by invoking the source with `Source()`. The return `stream` acts as an iterator over the `packet` and its `tag`.\n", + "For convenience, `source` can be treated synonymously with a `stream`, allowing you to directly iterate over the content." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Packet {'txt_file': PosixPath('../examples/dataset1/day1.txt')} with tag {'file_name': 'day1'}\n", + "Packet {'txt_file': PosixPath('../examples/dataset1/day2.txt')} with tag {'file_name': 'day2'}\n", + "Packet {'txt_file': PosixPath('../examples/dataset1/day3.txt')} with tag {'file_name': 'day3'}\n", + "Packet {'txt_file': PosixPath('../examples/dataset1/day4.txt')} with tag {'file_name': 'day4'}\n", + "Packet {'txt_file': PosixPath('../examples/dataset1/day6.txt')} with tag {'file_name': 'day6'}\n" + ] + } + ], + "source": [ + "for tag, packet in dataset1():\n", + " print(f\"Packet {packet} with tag {tag}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Packet {'txt_file': PosixPath('../examples/dataset1/day1.txt')} with tag {'file_name': 'day1'}\n", + "Packet {'txt_file': PosixPath('../examples/dataset1/day2.txt')} with tag {'file_name': 'day2'}\n", + "Packet {'txt_file': PosixPath('../examples/dataset1/day3.txt')} with tag {'file_name': 'day3'}\n", + "Packet {'txt_file': PosixPath('../examples/dataset1/day4.txt')} with tag {'file_name': 'day4'}\n", + "Packet {'txt_file': PosixPath('../examples/dataset1/day6.txt')} with tag {'file_name': 'day6'}\n" + ] + } + ], + "source": [ + "# equivalent to above but more natural without the need to call `dataset1()`\n", + "for tag, packet in dataset1:\n", + " print(f\"Packet {packet} with tag {tag}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A few things to note. When creating the `GlobSource` we pass in the argument name to be associated with the `pathset` matching our glob pattern (`*.txt` in this case). By default, the `GlobSource` tags each packet with a key of `file_name` and value of the name of the file that was matched (minus the file extension). This behavior can be easily changed by passing in a custom function for tag generation at the time of `GlobSource` creation." 
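The next cell uses a simple `lambda` as the tag function; as a slightly richer, purely illustrative variant, a tag function can also parse structure out of the file name, for example extracting just the day number so it can later be matched against other streams. This sketch relies on the same `tag_function` argument used in the cell below; the regex-based tag itself is an assumption, not a library convention.

```python
# Hypothetical tag function: derive a {"day": ...} tag from file names like "day1.txt".
import re
from pathlib import Path

def day_tag(path) -> dict:
    m = re.search(r"day(\d+)", Path(path).stem)
    return {"day": m.group(1)} if m else {"file_name": Path(path).stem}

dataset1_by_day = ob.GlobSource(
    "txt_file", "../examples/dataset1", "*.txt", tag_function=day_tag
)
```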
+ ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "\n", + "dataset1_custom = ob.GlobSource(\n", + " \"data\",\n", + " \"../examples/dataset1\",\n", + " \"*.txt\",\n", + " tag_function=lambda x: {\"date\": Path(x).stem},\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Packet {'data': PosixPath('../examples/dataset1/day1.txt')} with tag {'date': 'day1'}\n", + "Packet {'data': PosixPath('../examples/dataset1/day2.txt')} with tag {'date': 'day2'}\n", + "Packet {'data': PosixPath('../examples/dataset1/day3.txt')} with tag {'date': 'day3'}\n", + "Packet {'data': PosixPath('../examples/dataset1/day4.txt')} with tag {'date': 'day4'}\n", + "Packet {'data': PosixPath('../examples/dataset1/day6.txt')} with tag {'date': 'day6'}\n" + ] + } + ], + "source": [ + "for tag, packet in dataset1_custom:\n", + " print(f\"Packet {packet} with tag {tag}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Custom tag function would allow one to extract information useful in controlling the flow of the data pipeline from the file path or even the file content. We will return to this a bit later." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In general, a packet is generated and starts flowing into a `stream` **only** when you ask for it by iterating through the elements. This allows for a series of streams and pods to be chained together without immediately invoking any computation." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's go ahead and load another source from a folder containing multiple `*.bin` files, representing data collected on different days." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Packet {'bin_data': PosixPath('../examples/dataset2/session_day1.bin')} with tag {'file_name': 'session_day1'}\n", + "Packet {'bin_data': PosixPath('../examples/dataset2/session_day3.bin')} with tag {'file_name': 'session_day3'}\n", + "Packet {'bin_data': PosixPath('../examples/dataset2/session_day4.bin')} with tag {'file_name': 'session_day4'}\n", + "Packet {'bin_data': PosixPath('../examples/dataset2/session_day5.bin')} with tag {'file_name': 'session_day5'}\n" + ] + } + ], + "source": [ + "dataset2 = ob.GlobSource(\"bin_data\", \"../examples/dataset2\", \"*.bin\")\n", + "\n", + "for tag, packet in dataset2:\n", + " print(f\"Packet {packet} with tag {tag}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we have two streams to work with, let's explore how we can manipulate/control the flow of streams using `operations` and, specifically, `mapper` operations." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Manipulating streams with `operations`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As defined ealier in the [core concepts](./01_orcabridge_core_concepts%20copy.ipynb#core-concepts), we refer to any computation/transformation that works on stream(s) as `operations` in the pipeline. If the Orcapod pipeline were to be viewed as a DAG, the `streams` would be the edges connecting *nodes* that are the `operations`. 
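As noted above, packets only start flowing when the final stream is iterated. The following plain-Python generator is an analogy for that lazy behaviour, not the library's implementation.

```python
# Nothing below runs until the final loop iterates the chained stream.
def noisy_source():
    for i in range(3):
        print(f"producing packet {i}")
        yield {"id": i}, {"value": float(i)}

# Building the chain triggers no computation and prints nothing yet.
renamed = (({"index": tag["id"]}, packet) for tag, packet in noisy_source())

for tag, packet in renamed:  # packets are produced only now, one at a time
    print(f"Packet {packet} with tag {tag}")
```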
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`Operations` can be divided into three categories based on their roles in the processing and manipulating streams. `Source`, `Mappers` and `Pods`. We have already seen an example of `Source` earlier when we worked with `GlobSource`. Officially, `Source` is an `operation` that produces a `stream` without taking in any inputs. They are best thought of as entry points of data into the pipeline.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "`Mappers` are `operations` that controls and alter the streams but *without generating or modifying new data files*. As we will see shortly, `mappers` work to alter the stream by alterning packet tags and/or packet content, but critically will never create or modify new files that were not already present somewhere in the stream feeding into the `mapper` node. While this might sound like an unnecessary restriction on what `mappers` can do, we will see that this property guarantees that *mappers can not ever alter the reproducibility of computational chains*." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The third category of `operations` are `Pods`, these operations are **allowed to generate and flow new files into the streams** *based on* inputs they receive from other streams. Aside from `Source`, which takes no inputs, `Pods` are the only operations that can introduce new files into the stream.\n", + "\n", + "We will explore pods in great detail later. First let's get to know `mappers`." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Controling data streams with `Mappers`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once you have created a `source` from which streams can be formed, you can alter the stream by applying various `mappers`. More precisely, a `mapper` can work on tags and/or packets." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Map packets\n", + "Likely one of the most common mapper operation to be found in Orcapod pipeline is `MapPackets` mapper. As the name implies, it let's you alter the keys (argument names) found in the `packet`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Before mapping:\n", + "Packet {'txt_file': PosixPath('../examples/dataset1/day1.txt')} with tag {'file_name': 'day1'}\n", + "Packet {'txt_file': PosixPath('../examples/dataset1/day2.txt')} with tag {'file_name': 'day2'}\n", + "Packet {'txt_file': PosixPath('../examples/dataset1/day3.txt')} with tag {'file_name': 'day3'}\n", + "Packet {'txt_file': PosixPath('../examples/dataset1/day4.txt')} with tag {'file_name': 'day4'}\n", + "Packet {'txt_file': PosixPath('../examples/dataset1/day6.txt')} with tag {'file_name': 'day6'}\n", + "After mapping:\n", + "Mapped Packet {'content': PosixPath('../examples/dataset1/day1.txt')} with tag {'file_name': 'day1'}\n", + "Mapped Packet {'content': PosixPath('../examples/dataset1/day2.txt')} with tag {'file_name': 'day2'}\n", + "Mapped Packet {'content': PosixPath('../examples/dataset1/day3.txt')} with tag {'file_name': 'day3'}\n", + "Mapped Packet {'content': PosixPath('../examples/dataset1/day4.txt')} with tag {'file_name': 'day4'}\n", + "Mapped Packet {'content': PosixPath('../examples/dataset1/day6.txt')} with tag {'file_name': 'day6'}\n" + ] + } + ], + "source": [ + "print(\"Before mapping:\")\n", + "for tag, packet in dataset1:\n", + " print(f\"Packet {packet} with tag {tag}\")\n", + "\n", + "\n", + "# create a new stream mapping packet keys 'txt_file' to 'content'\n", + "packet_mapper = ob.MapPackets(key_map={\"txt_file\": \"content\"})\n", + "\n", + "print(\"After mapping:\")\n", + "for tag, packet in packet_mapper(dataset1):\n", + " print(f\"Mapped Packet {packet} with tag {tag}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You'll notice that for each packet, the key `txt_file` was replaced with `content` without altering the referenced `path` or the associated tag. As the keys of the packets will be used as the argument names when invoking pods on a stream, we will see that `MapPackets` is commonly used to *map* the correct path to the argument." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Map tags\n", + "As we have already seen, each packet in the stream is associated with a tag, often derived from the data source. In the case of `GlobSource`, the tag is by default the name of the file that formed the packet. These tags are used to *transiently* identify the packet and will be used when matching packets across multiple streams (as we will see shortly with the `Join` operation). You can manipulate the tags using the `MapTags` operation, which works much like `MapPackets` but operates on the tag of each packet under a uniform renaming rule."
+ ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'day': 'day1'} {'txt_file': PosixPath('../examples/dataset1/day1.txt')}\n", + "{'day': 'day2'} {'txt_file': PosixPath('../examples/dataset1/day2.txt')}\n", + "{'day': 'day3'} {'txt_file': PosixPath('../examples/dataset1/day3.txt')}\n", + "{'day': 'day4'} {'txt_file': PosixPath('../examples/dataset1/day4.txt')}\n", + "{'day': 'day6'} {'txt_file': PosixPath('../examples/dataset1/day6.txt')}\n" + ] + } + ], + "source": [ + "tag_mapper = ob.MapTags(key_map={\"file_name\": \"day\"})\n", + "\n", + "for tag, packet in tag_mapper(dataset1):\n", + " print(tag, packet)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Chaining operations" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As you might expect, you can chain multiple operations one after another to construct a more complex stream. Below, we first apply the packet key mapping and then map the tags." + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Mapped Packet {'content': PosixPath('../examples/dataset1/day1.txt')} with tag {'day': 'day1'}\n", + "Mapped Packet {'content': PosixPath('../examples/dataset1/day2.txt')} with tag {'day': 'day2'}\n", + "Mapped Packet {'content': PosixPath('../examples/dataset1/day3.txt')} with tag {'day': 'day3'}\n", + "Mapped Packet {'content': PosixPath('../examples/dataset1/day4.txt')} with tag {'day': 'day4'}\n", + "Mapped Packet {'content': PosixPath('../examples/dataset1/day6.txt')} with tag {'day': 'day6'}\n" + ] + } + ], + "source": [ + "packet_mapper = ob.MapPackets(key_map={\"txt_file\": \"content\"})\n", + "key_mapped_stream = packet_mapper(dataset1)\n", + "\n", + "tag_mapper = ob.MapTags(key_map={\"file_name\": \"day\"})\n", + "tag_and_packet_mapped = tag_mapper(key_mapped_stream)\n", + "\n", + "for tag, packet in tag_and_packet_mapped:\n", + " print(f\"Mapped Packet {packet} with tag {tag}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It's worth emphasizing again that all computations are triggered only when you iterate through the final stream `tag_and_packet_mapped`." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can also create and immediately apply the `mappers` to achieve the same processing in fewer lines of code (albeit with worse readability, so this style is not recommended):" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Mapped Packet {'content': PosixPath('../examples/dataset1/day1.txt')} with tag {'day': 'day1'}\n", + "Mapped Packet {'content': PosixPath('../examples/dataset1/day2.txt')} with tag {'day': 'day2'}\n", + "Mapped Packet {'content': PosixPath('../examples/dataset1/day3.txt')} with tag {'day': 'day3'}\n", + "Mapped Packet {'content': PosixPath('../examples/dataset1/day4.txt')} with tag {'day': 'day4'}\n", + "Mapped Packet {'content': PosixPath('../examples/dataset1/day6.txt')} with tag {'day': 'day6'}\n" + ] + } + ], + "source": [ + "# totally valid, but difficult to read and thus not recommended\n", + "for tag, packet in ob.MapTags(key_map={\"file_name\": \"day\"})(\n", + " ob.MapPackets(key_map={\"txt_file\": \"content\"})(dataset1)\n", + "):\n", + " print(f\"Mapped Packet 
{packet} with tag {tag}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Joining multiple streams into a single stream\n", + "Now that we have looked at how you can manipulate a single stream, let's turn to how you can work with more than one stream at a time.\n", + "\n", + "By far the most common multi-stream operation is to join two (or more) streams into a single, bigger stream. \n", + "You can combine multiple streams into one using the `Join` operation, which matches packets from each stream based on their tags. If the tags from two streams have shared keys, the values must be identical for all shared keys for the two packets to be matched. The matched packets are then merged into one (typically larger) packet and shipped to the output stream." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's see what happens if we join `dataset1` and `dataset2`, whose contents are shown below:" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset 1:\n", + "Tag: {'file_name': 'day1'}, Packet: {'txt_file': PosixPath('../examples/dataset1/day1.txt')}\n", + "Tag: {'file_name': 'day2'}, Packet: {'txt_file': PosixPath('../examples/dataset1/day2.txt')}\n", + "Tag: {'file_name': 'day3'}, Packet: {'txt_file': PosixPath('../examples/dataset1/day3.txt')}\n", + "Tag: {'file_name': 'day4'}, Packet: {'txt_file': PosixPath('../examples/dataset1/day4.txt')}\n", + "Tag: {'file_name': 'day6'}, Packet: {'txt_file': PosixPath('../examples/dataset1/day6.txt')}\n", + "\n", + "Dataset 2:\n", + "Tag: {'file_name': 'session_day1'}, Packet: {'bin_data': PosixPath('../examples/dataset2/session_day1.bin')}\n", + "Tag: {'file_name': 'session_day3'}, Packet: {'bin_data': PosixPath('../examples/dataset2/session_day3.bin')}\n", + "Tag: {'file_name': 'session_day4'}, Packet: {'bin_data': PosixPath('../examples/dataset2/session_day4.bin')}\n", + "Tag: {'file_name': 'session_day5'}, Packet: {'bin_data': PosixPath('../examples/dataset2/session_day5.bin')}\n" + ] + } + ], + "source": [ + "# dataset 1\n", + "print(\"Dataset 1:\")\n", + "for tag, packet in dataset1:\n", + " print(f\"Tag: {tag}, Packet: {packet}\")\n", + "\n", + "# dataset 2\n", + "print(\"\\nDataset 2:\")\n", + "for tag, packet in dataset2:\n", + " print(f\"Tag: {tag}, Packet: {packet}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Any guess what would happen?" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "join_op = ob.Join()\n", + "\n", + "for tag, packet in join_op(dataset1, dataset2):\n", + " print(f\"Tag: {tag}, Packet: {packet}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You may be surprised to see that the joined stream is completely empty! This is because packets from both streams were tagged with the key `file_name`, causing `Join` to combine packets only if the value of `file_name` matches exactly. Since no filenames matched, the resulting stream was empty!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is where we can make use of the other `mappers` to our advantage and achieve a more useful join." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, let's completely rename the tag key for one of the streams and see what would happen."
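+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Before we do, it may help to see the matching rule sketched out in plain Python. The cell below is purely illustrative -- it is not Orcapod's actual implementation -- but it captures the behavior described above: two packets are paired whenever their tags agree on every shared key, and the matched tags and packets are then merged." + ] + },
+ { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Illustrative sketch of the Join matching rule -- not the actual implementation\n", + "def toy_join(stream1, stream2):\n", + "    for tag1, packet1 in stream1:\n", + "        for tag2, packet2 in stream2:\n", + "            shared_keys = set(tag1) & set(tag2)\n", + "            if all(tag1[k] == tag2[k] for k in shared_keys):\n", + "                # merge the tags and the packets of the matched pair\n", + "                yield {**tag1, **tag2}, {**packet1, **packet2}\n", + "\n", + "\n", + "# with no shared tag keys every pairing matches (full multiplexing);\n", + "# with a shared key, only pairings whose values agree survive\n", + "left = [({\"day\": \"day1\"}, {\"txt_file\": \"day1.txt\"})]\n", + "right = [\n", + "    ({\"day\": \"day1\"}, {\"bin_data\": \"session_day1.bin\"}),\n", + "    ({\"day\": \"day3\"}, {\"bin_data\": \"session_day3.bin\"}),\n", + "]\n", + "for tag, packet in toy_join(left, right):\n", + "    print(tag, packet)"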
+ ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "01 Tag: {'day': 'day1', 'file_name': 'session_day1'}, Packet: {'txt_file': PosixPath('../examples/dataset1/day1.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day1.bin')}\n", + "02 Tag: {'day': 'day1', 'file_name': 'session_day3'}, Packet: {'txt_file': PosixPath('../examples/dataset1/day1.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day3.bin')}\n", + "03 Tag: {'day': 'day1', 'file_name': 'session_day4'}, Packet: {'txt_file': PosixPath('../examples/dataset1/day1.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day4.bin')}\n", + "04 Tag: {'day': 'day1', 'file_name': 'session_day5'}, Packet: {'txt_file': PosixPath('../examples/dataset1/day1.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day5.bin')}\n", + "05 Tag: {'day': 'day2', 'file_name': 'session_day1'}, Packet: {'txt_file': PosixPath('../examples/dataset1/day2.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day1.bin')}\n", + "06 Tag: {'day': 'day2', 'file_name': 'session_day3'}, Packet: {'txt_file': PosixPath('../examples/dataset1/day2.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day3.bin')}\n", + "07 Tag: {'day': 'day2', 'file_name': 'session_day4'}, Packet: {'txt_file': PosixPath('../examples/dataset1/day2.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day4.bin')}\n", + "08 Tag: {'day': 'day2', 'file_name': 'session_day5'}, Packet: {'txt_file': PosixPath('../examples/dataset1/day2.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day5.bin')}\n", + "09 Tag: {'day': 'day3', 'file_name': 'session_day1'}, Packet: {'txt_file': PosixPath('../examples/dataset1/day3.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day1.bin')}\n", + "10 Tag: {'day': 'day3', 'file_name': 'session_day3'}, Packet: {'txt_file': PosixPath('../examples/dataset1/day3.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day3.bin')}\n", + "11 Tag: {'day': 'day3', 'file_name': 'session_day4'}, Packet: {'txt_file': PosixPath('../examples/dataset1/day3.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day4.bin')}\n", + "12 Tag: {'day': 'day3', 'file_name': 'session_day5'}, Packet: {'txt_file': PosixPath('../examples/dataset1/day3.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day5.bin')}\n", + "13 Tag: {'day': 'day4', 'file_name': 'session_day1'}, Packet: {'txt_file': PosixPath('../examples/dataset1/day4.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day1.bin')}\n", + "14 Tag: {'day': 'day4', 'file_name': 'session_day3'}, Packet: {'txt_file': PosixPath('../examples/dataset1/day4.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day3.bin')}\n", + "15 Tag: {'day': 'day4', 'file_name': 'session_day4'}, Packet: {'txt_file': PosixPath('../examples/dataset1/day4.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day4.bin')}\n", + "16 Tag: {'day': 'day4', 'file_name': 'session_day5'}, Packet: {'txt_file': PosixPath('../examples/dataset1/day4.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day5.bin')}\n", + "17 Tag: {'day': 'day6', 'file_name': 'session_day1'}, Packet: {'txt_file': PosixPath('../examples/dataset1/day6.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day1.bin')}\n", + "18 Tag: {'day': 'day6', 'file_name': 'session_day3'}, Packet: {'txt_file': PosixPath('../examples/dataset1/day6.txt'), 'bin_data': 
PosixPath('../examples/dataset2/session_day3.bin')}\n", + "19 Tag: {'day': 'day6', 'file_name': 'session_day4'}, Packet: {'txt_file': PosixPath('../examples/dataset1/day6.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day4.bin')}\n", + "20 Tag: {'day': 'day6', 'file_name': 'session_day5'}, Packet: {'txt_file': PosixPath('../examples/dataset1/day6.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day5.bin')}\n" + ] + } + ], + "source": [ + "dataset1_retagged = ob.MapTags(key_map={\"file_name\": \"day\"})(dataset1)\n", + "\n", + "for i, (tag, packet) in enumerate(join_op(dataset1_retagged, dataset2)):\n", + " print(f\"{i + 1:02d} Tag: {tag}, Packet: {packet}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We are now getting something -- in fact, quite a few things. If you look carefully at the `packet`, you'll notice that it now contains two keys/arguments -- `txt_file` and `bin_data` -- combining the packets from the two datasets. \n", + "\n", + "The `tags` now also contain two keys: `day` from the re-tagged dataset1 stream and `file_name` from the unchanged dataset2 stream.\n", + "\n", + "Since the two streams share no common tag keys, the `Join` operation results in a *full multiplexing* of the two streams. With the streams from dataset1 and dataset2 containing 5 packets and 4 packets, respectively, you get $5 \times 4 = 20$ packets." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "However, `Join` would not be all that useful if all it could do is produce either zero packets or the full combination of packets from two streams. The true value of `Join` lies in its ability to match packets that are *related* to each other. \n", + "\n", + "In our example datasets, you likely noticed that files from both datasets are associated with a day. Let's now try to join the two dataset streams by matching on the day!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Although we could achieve the desired effect by changing how we load the source (passing a custom `tag_function` into `GlobSource`), let's achieve the same using another `mapper` called `Transform`. `Transform` effectively combines `MapPackets` and `MapTags` but further allows you to provide a function that will receive the tag and packet, one pair at a time, and return a (potentially modified) tag and/or packet, achieving the desired transformation."
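+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As a quick illustration of that generality, the tag renaming we did earlier with `MapTags` could in principle be expressed as a `Transform` as well. The cell below is just a sketch to convey the idea (it assumes the transform function simply returns a new tag and the unchanged packet); the `Transform` usage demonstrated next is the one we actually rely on." + ] + },
+ { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Sketch: reproducing the earlier MapTags renaming with a Transform instead.\n", + "# The function receives one (tag, packet) pair and returns the (possibly new) pair.\n", + "def rename_tag(tag, packet):\n", + "    new_tag = {\"day\": tag[\"file_name\"]}\n", + "    return new_tag, packet\n", + "\n", + "\n", + "# equivalent in effect to ob.MapTags(key_map={\"file_name\": \"day\"}) for this stream\n", + "retag_with_transform = ob.mapper.Transform(rename_tag)\n", + "\n", + "for tag, packet in retag_with_transform(dataset1):\n", + "    print(f\"Tag: {tag}, Packet: {packet}\")"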
+ ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tag: {'day': 'day1'}, Packet: {'bin_data': PosixPath('../examples/dataset2/session_day1.bin')}\n", + "Tag: {'day': 'day3'}, Packet: {'bin_data': PosixPath('../examples/dataset2/session_day3.bin')}\n", + "Tag: {'day': 'day4'}, Packet: {'bin_data': PosixPath('../examples/dataset2/session_day4.bin')}\n", + "Tag: {'day': 'day5'}, Packet: {'bin_data': PosixPath('../examples/dataset2/session_day5.bin')}\n" + ] + } + ], + "source": [ + "def transform_dataset2(tag, packet):\n", + " # Extract the second half of the filename, which contains the day\n", + " new_tag = {\"day\": tag[\"file_name\"].split(\"_\")[1]}\n", + " return new_tag, packet\n", + "\n", + "\n", + "# Special mappers like Transform can be found in the orcabridge.mapper module\n", + "dataset2_transformer = ob.mapper.Transform(transform_dataset2)\n", + "\n", + "retagged_dataset2 = dataset2_transformer(dataset2)\n", + "\n", + "for tag, packet in retagged_dataset2:\n", + " print(f\"Tag: {tag}, Packet: {packet}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we have dataset2 packets tagged with `day`, let's `join` it with a re-tagged dataset1!" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tag: {'day': 'day1'}, Packet: {'txt_file': PosixPath('../examples/dataset1/day1.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day1.bin')}\n", + "Tag: {'day': 'day3'}, Packet: {'txt_file': PosixPath('../examples/dataset1/day3.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day3.bin')}\n", + "Tag: {'day': 'day4'}, Packet: {'txt_file': PosixPath('../examples/dataset1/day4.txt'), 'bin_data': PosixPath('../examples/dataset2/session_day4.bin')}\n" + ] + } + ], + "source": [ + "# change filename to day for dataset1\n", + "tag_mapper = ob.MapTags(key_map={\"file_name\": \"day\"})\n", + "retagged_dataset1 = tag_mapper(dataset1)\n", + "\n", + "join_op = ob.Join()\n", + "joined_stream = join_op(retagged_dataset1, retagged_dataset2)\n", + "\n", + "for tag, packet in joined_stream:\n", + " print(f\"Tag: {tag}, Packet: {packet}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Nice! We have now formed a stream where packets from the two streams are paired meaningfully based on matching `day`!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we have explored quite a bit of how to manipulate data streams using `mapper` operations, it's time to turn to the other half of the operations: `pods`." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Introducing new files into the stream with `Pod`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "While `mapper` operations are useful for altering tags and packets and for combining multiple streams, a data pipeline is not really useful if it cannot produce new results in the form of new data -- that is, introduce new files into the stream. This is precisely where `Pod` operations come in!\n", + "\n", + "In fact, we have already been working with a `pod` all along -- `sources`. If you think about it, `sources` also introduce files into the stream. 
It is just special in that it takes no input streams (hence the name, `source`).\n", + "\n", + "We will now explore how you can create a more common type of pod -- a *function* `pod` that takes in a stream and returns a new stream, potentially introducing entirely new data files!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Working with `FunctionPod`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The easiest way to create a function-like `pod` is to create a `FunctionPod`, passing in a Python function. Let's start by creating a pod that will count the number of lines in a file.\n", + "\n", + "We first define the function." + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "from os import PathLike\n", + "\n", + "\n", + "def count_lines(txt_file: PathLike) -> None:\n", + " with open(txt_file, \"r\") as f:\n", + " lines = f.readlines()\n", + " print(f\"File {txt_file} has {len(lines)} lines.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next we instantiate a function pod from the function." + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + "source": [ + "# create a function pod\n", + "function_pod = ob.FunctionPod(count_lines, output_keys=[])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once the function pod is available, you can execute it on any compatible stream." + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "File ../examples/dataset1/day1.txt has 24 lines.\n", + "Tag: {'file_name': 'day1'}, Packet: {}\n", + "File ../examples/dataset1/day2.txt has 15 lines.\n", + "Tag: {'file_name': 'day2'}, Packet: {}\n", + "File ../examples/dataset1/day3.txt has 27 lines.\n", + "Tag: {'file_name': 'day3'}, Packet: {}\n", + "File ../examples/dataset1/day4.txt has 22 lines.\n", + "Tag: {'file_name': 'day4'}, Packet: {}\n", + "File ../examples/dataset1/day6.txt has 22 lines.\n", + "Tag: {'file_name': 'day6'}, Packet: {}\n" + ] + } + ], + "source": [ + "# apply the function pod on a stream\n", + "processed_stream = function_pod(dataset1)\n", + "\n", + "for tag, packet in processed_stream:\n", + " print(f\"Tag: {tag}, Packet: {packet}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notice that the returned `packet` is empty because the function returns no values. Such a function pod may still be useful for achieving computations/processing via *side effects* (e.g., submitting HTTP requests in the function body), but it is not the standard approach for computations where you'd want the results to persist.\n", + "\n", + "Next, let's look at the more common scenario where you perform some computation and would like to save the result into a file. Each dataset2 binary file actually contains a list of float values. Let's define a function to compute a few statistics and save them to a file in a temporary directory."
+ ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import tempfile\n", + "\n", + "import numpy as np\n", + "\n", + "\n", + "def compute_stats(bin_file: PathLike, output_file=None):\n", + " print(\"Computing stats for file:\", bin_file)\n", + " # create a temporary file to store the status and return the file path\n", + " with open(bin_file, \"rb\") as f:\n", + " data = f.read()\n", + " data = np.frombuffer(data)\n", + " print(data)\n", + " data_stats = {}\n", + " data_stats[\"mean\"] = np.mean(data)\n", + " data_stats[\"std\"] = np.std(data)\n", + " data_stats[\"min\"] = np.min(data)\n", + " data_stats[\"max\"] = np.max(data)\n", + " data_stats[\"n_elements\"] = len(data)\n", + "\n", + " # if output_file is none, create a temporary file. Else, use the given output_file to save the data_stats\n", + " if output_file is None:\n", + " output_file = Path(tempfile.mkdtemp()) / \"statistics.json\"\n", + " # write as json\n", + " with open(output_file, \"w\") as f:\n", + " json.dump(data_stats, f)\n", + " return output_file" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Computing stats for file: ../examples/dataset2/session_day1.bin\n", + "[-1.08209134 -0.66806394 0.42870206 -0.09321731 -3.14078305 1.33520433\n", + " 1.11085152 1.31931842 -1.19915697 0.07701737 1.30020807 0.27541194\n", + " 0.84430062 0.18236837 -0.83039631 -1.66166191 0.8720775 -1.72170657\n", + " -0.01962253 -0.18050553 1.35478472 0.69928177 0.7314272 -0.06915687\n", + " -0.08364667 -0.45551653 0.70752188 1.02283734 -0.18612795 0.8767394\n", + " -1.542636 1.04685484 -2.1311672 -1.34874222 0.61977577 -0.33880262\n", + " 0.6624482 0.60257325 -3.04901544 -0.20685843 -0.08997232 0.88932232]\n", + "Tag: {'file_name': 'session_day1'}, Packet: {'stats': PosixPath('/tmp/tmpm2wka6il/statistics.json')}\n", + "Computing stats for file: ../examples/dataset2/session_day3.bin\n", + "[ 0.56114059 -1.34902274 1.0665563 0.71890802 0.65244834 1.04369548\n", + " 0.54872876 2.19365207 0.53864286 -1.44108823 -0.55651539 0.1603561\n", + " -0.93869224 0.64645323 -1.08815337 1.40972393 -0.14662931 1.34692375\n", + " 0.38400938 -1.23004316 1.34426647 -0.07620065 -0.91983972 0.23537101\n", + " 0.91515395 0.8064348 0.81470895 -1.04466683 -0.25893558 -1.46253167\n", + " 1.39972807 -0.13940519]\n", + "Tag: {'file_name': 'session_day3'}, Packet: {'stats': PosixPath('/tmp/tmplkmx65ll/statistics.json')}\n", + "Computing stats for file: ../examples/dataset2/session_day4.bin\n", + "[ 0.70078854 1.18137906 -0.44361437 -0.389409 0.29719038 0.2523247\n", + " -0.97418716 0.49301127 0.07900351 -0.29965042 -0.25810762 -2.78777445\n", + " -1.24321702 0.13011593 1.07826637 -0.33177479 -0.78337033 -1.30075356\n", + " -0.15710138 0.51927589 0.08671884 0.02058063 0.20778149 -1.40382559\n", + " -0.69978105 -1.10525753 0.1945444 0.82623748 0.17467868]\n", + "Tag: {'file_name': 'session_day4'}, Packet: {'stats': PosixPath('/tmp/tmpxajrzctd/statistics.json')}\n", + "Computing stats for file: ../examples/dataset2/session_day5.bin\n", + "[ 1.9125739 -0.05252076 0.33347618 0.31627214 0.47141153 -0.71088615\n", + " -0.74745805 0.53959117 -0.14395142 -0.28713782 -0.29422236 -1.00231383\n", + " 0.69566576 -0.25895608 -0.9660761 -0.78504297 -1.91668262 0.89452296\n", + " -0.82748688 -0.19792482 0.07305616 0.36133414 1.7164791 0.64364619\n", + " -0.73146429 0.96324864 -1.05981222 -0.59502066 
0.15084192]\n", + "Tag: {'file_name': 'session_day5'}, Packet: {'stats': PosixPath('/tmp/tmp67rthfe1/statistics.json')}\n" + ] + } + ], + "source": [ + "fp_stats = ob.FunctionPod(compute_stats, output_keys=[\"stats\"])\n", + "\n", + "# change the key from 'bin_data' to 'bin_file', matching the function's input\n", + "mapped_dataset2 = ob.MapPackets(key_map={\"bin_data\": \"bin_file\"})(dataset2)\n", + "\n", + "for tag, packet in fp_stats(mapped_dataset2):\n", + " print(f\"Tag: {tag}, Packet: {packet}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that in our function `compute_stats`, the computed stats are saved as a `json` file in a temporary directory. While this works for passing data from one operation to another within the pipeline, the result cannot be easily retrieved outside of the immediate usage. The computation result is also very likely to disappear at some point (after all, it's a temporary file). Furthermore, if you were to execute the same computation by iterating a second time over the output stream, you would see that it invokes the function yet again and produces an entirely different set of temporary files. Since the inputs to the computation didn't change, this is clearly quite wasteful!" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Computing stats for file: ../examples/dataset2/session_day1.bin\n", + "[-1.08209134 -0.66806394 0.42870206 -0.09321731 -3.14078305 1.33520433\n", + " 1.11085152 1.31931842 -1.19915697 0.07701737 1.30020807 0.27541194\n", + " 0.84430062 0.18236837 -0.83039631 -1.66166191 0.8720775 -1.72170657\n", + " -0.01962253 -0.18050553 1.35478472 0.69928177 0.7314272 -0.06915687\n", + " -0.08364667 -0.45551653 0.70752188 1.02283734 -0.18612795 0.8767394\n", + " -1.542636 1.04685484 -2.1311672 -1.34874222 0.61977577 -0.33880262\n", + " 0.6624482 0.60257325 -3.04901544 -0.20685843 -0.08997232 0.88932232]\n", + "Tag: {'file_name': 'session_day1'}, Packet: {'stats': PosixPath('/tmp/tmpciwa2xl_/statistics.json')}\n", + "Computing stats for file: ../examples/dataset2/session_day3.bin\n", + "[ 0.56114059 -1.34902274 1.0665563 0.71890802 0.65244834 1.04369548\n", + " 0.54872876 2.19365207 0.53864286 -1.44108823 -0.55651539 0.1603561\n", + " -0.93869224 0.64645323 -1.08815337 1.40972393 -0.14662931 1.34692375\n", + " 0.38400938 -1.23004316 1.34426647 -0.07620065 -0.91983972 0.23537101\n", + " 0.91515395 0.8064348 0.81470895 -1.04466683 -0.25893558 -1.46253167\n", + " 1.39972807 -0.13940519]\n", + "Tag: {'file_name': 'session_day3'}, Packet: {'stats': PosixPath('/tmp/tmpkq824j5b/statistics.json')}\n", + "Computing stats for file: ../examples/dataset2/session_day4.bin\n", + "[ 0.70078854 1.18137906 -0.44361437 -0.389409 0.29719038 0.2523247\n", + " -0.97418716 0.49301127 0.07900351 -0.29965042 -0.25810762 -2.78777445\n", + " -1.24321702 0.13011593 1.07826637 -0.33177479 -0.78337033 -1.30075356\n", + " -0.15710138 0.51927589 0.08671884 0.02058063 0.20778149 -1.40382559\n", + " -0.69978105 -1.10525753 0.1945444 0.82623748 0.17467868]\n", + "Tag: {'file_name': 'session_day4'}, Packet: {'stats': PosixPath('/tmp/tmp7ii2nd6e/statistics.json')}\n", + "Computing stats for file: ../examples/dataset2/session_day5.bin\n", + "[ 1.9125739 -0.05252076 0.33347618 0.31627214 0.47141153 -0.71088615\n", + " -0.74745805 0.53959117 -0.14395142 -0.28713782 -0.29422236 -1.00231383\n", + " 0.69566576 -0.25895608 -0.9660761 -0.78504297 -1.91668262 0.89452296\n", + 
" -0.82748688 -0.19792482 0.07305616 0.36133414 1.7164791 0.64364619\n", + " -0.73146429 0.96324864 -1.05981222 -0.59502066 0.15084192]\n", + "Tag: {'file_name': 'session_day5'}, Packet: {'stats': PosixPath('/tmp/tmpz23q61gg/statistics.json')}\n" + ] + } + ], + "source": [ + "# everytime you run the following loop, new computations are performed and\n", + "# saved in a different set of temporary files\n", + "for tag, packet in fp_stats(mapped_dataset2):\n", + " print(f\"Tag: {tag}, Packet: {packet}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the next section we will see how we can have the computation restuls stored using storage-backed function pods." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### [Technical aside] Caching stream" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**NOTE**: This section concerns an implementation detail of `Oracbridge` that is not fundamentally related to the design of the system. In particular, the issue described in this section (and the associated *solution*) is not relevant to the full-implementation that `Orcapod` will be. If you are reading this document primarily to understand the concepts essential to Orcapod, you are advised to skip this section entirely. However, if you intend to make use of `oracabridge` in an actual application, read on to learn critical limitations associated with single-producer single-consumer (SPSC) design of the `orcabridge` and how you can ameloiorate this using `CacheStream` mapper effectively within your pipeline." + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "HashableMixin.__hash__ called on CacheStream instance without identity_structure() implementation. 
Falling back to super().__hash__() which is not stable across sessions.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Computing stats for file: ../examples/dataset2/session_day1.bin\n", + "[-1.08209134 -0.66806394 0.42870206 -0.09321731 -3.14078305 1.33520433\n", + " 1.11085152 1.31931842 -1.19915697 0.07701737 1.30020807 0.27541194\n", + " 0.84430062 0.18236837 -0.83039631 -1.66166191 0.8720775 -1.72170657\n", + " -0.01962253 -0.18050553 1.35478472 0.69928177 0.7314272 -0.06915687\n", + " -0.08364667 -0.45551653 0.70752188 1.02283734 -0.18612795 0.8767394\n", + " -1.542636 1.04685484 -2.1311672 -1.34874222 0.61977577 -0.33880262\n", + " 0.6624482 0.60257325 -3.04901544 -0.20685843 -0.08997232 0.88932232]\n", + "Tag: {'file_name': 'session_day1'}, Packet: {'stats': PosixPath('/tmp/tmpukvddhuv/statistics.json')}\n", + "Computing stats for file: ../examples/dataset2/session_day3.bin\n", + "[ 0.56114059 -1.34902274 1.0665563 0.71890802 0.65244834 1.04369548\n", + " 0.54872876 2.19365207 0.53864286 -1.44108823 -0.55651539 0.1603561\n", + " -0.93869224 0.64645323 -1.08815337 1.40972393 -0.14662931 1.34692375\n", + " 0.38400938 -1.23004316 1.34426647 -0.07620065 -0.91983972 0.23537101\n", + " 0.91515395 0.8064348 0.81470895 -1.04466683 -0.25893558 -1.46253167\n", + " 1.39972807 -0.13940519]\n", + "Tag: {'file_name': 'session_day3'}, Packet: {'stats': PosixPath('/tmp/tmpat3rm4dk/statistics.json')}\n", + "Computing stats for file: ../examples/dataset2/session_day4.bin\n", + "[ 0.70078854 1.18137906 -0.44361437 -0.389409 0.29719038 0.2523247\n", + " -0.97418716 0.49301127 0.07900351 -0.29965042 -0.25810762 -2.78777445\n", + " -1.24321702 0.13011593 1.07826637 -0.33177479 -0.78337033 -1.30075356\n", + " -0.15710138 0.51927589 0.08671884 0.02058063 0.20778149 -1.40382559\n", + " -0.69978105 -1.10525753 0.1945444 0.82623748 0.17467868]\n", + "Tag: {'file_name': 'session_day4'}, Packet: {'stats': PosixPath('/tmp/tmpuj3tiu8k/statistics.json')}\n", + "Computing stats for file: ../examples/dataset2/session_day5.bin\n", + "[ 1.9125739 -0.05252076 0.33347618 0.31627214 0.47141153 -0.71088615\n", + " -0.74745805 0.53959117 -0.14395142 -0.28713782 -0.29422236 -1.00231383\n", + " 0.69566576 -0.25895608 -0.9660761 -0.78504297 -1.91668262 0.89452296\n", + " -0.82748688 -0.19792482 0.07305616 0.36133414 1.7164791 0.64364619\n", + " -0.73146429 0.96324864 -1.05981222 -0.59502066 0.15084192]\n", + "Tag: {'file_name': 'session_day5'}, Packet: {'stats': PosixPath('/tmp/tmp6yohu0pw/statistics.json')}\n" + ] + } + ], + "source": [ + "# create a cache stream operation\n", + "cache_stream = ob.mapper.CacheStream()\n", + "# change the key from 'bin_data' to 'bin_file', matching the function's input\n", + "mapped_dataset2 = ob.MapPackets(key_map={\"bin_data\": \"bin_file\"})(dataset2)\n", + "stats_stream = fp_stats(mapped_dataset2)\n", + "\n", + "# now cache the stream\n", + "cached_stream = cache_stream(stats_stream)\n", + "\n", + "# iterate over the cached stream\n", + "for tag, packet in cached_stream:\n", + " print(f\"Tag: {tag}, Packet: {packet}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The first time we iterate over the `cached_stream`, you see that the function `compute_stats` is getting executed as we'd expect. However, it's when running it the second time you'd notice something is different." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tag: {'file_name': 'session_day1'}, Packet: {'stats': PosixPath('/tmp/tmpukvddhuv/statistics.json')}\n", + "Tag: {'file_name': 'session_day3'}, Packet: {'stats': PosixPath('/tmp/tmpat3rm4dk/statistics.json')}\n", + "Tag: {'file_name': 'session_day4'}, Packet: {'stats': PosixPath('/tmp/tmpuj3tiu8k/statistics.json')}\n", + "Tag: {'file_name': 'session_day5'}, Packet: {'stats': PosixPath('/tmp/tmp6yohu0pw/statistics.json')}\n" + ] + } + ], + "source": [ + "for tag, packet in cached_stream:\n", + " print(f\"Tag: {tag}, Packet: {packet}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Since the output packets from `stats_stream` have been cached, iterating through `cached_stream` a second time simply returned the cached packets without causing new computation. Although this may sound like a good way to prevent recomputing the same thing more than once, `CacheStream` comes with significant drawbacks. Since all observed packets are stored in memory, having too many `CacheStream` instances in the pipeline may consume a lot of memory. Also, unlike a storage-backed function pod, which we'll see shortly, `CacheStream` stores the packets as seen during one iteration of the underlying stream. If the underlying stream would have produced new and different packets (e.g., because additional `bin` files were added to the dataset), `CacheStream` won't be able to update itself unless you explicitly clear the cache. Finally, unlike with a storage-backed function pod, the computation is *not memoized*, so the same exact computation may still take place if two or more packets are identical in content and thus would have yielded identical outputs." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Using storage-backed function pod" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Although the simple `FunctionPod` worked as expected, its inability to store computation results significantly limits its utility. You certainly wouldn't want to be computing everything from scratch if it can be avoided.\n", + "\n", + "The good news is that you can easily equip a function pod with the ability to store and retrieve previously computed packets. All you have to do is create an instance of a `DataStore` and pass it in when constructing the `FunctionPod`.\n", + "\n", + "Here we are going to configure and use `DirDataStore`, where all `packets` and output `packet` contents are stored in a designated directory." + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [], + "source": [ + "data_store = ob.DirDataStore(\"./pod_data\")" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [], + "source": [ + "# use the default storage directory of './pod_data'. You could specify a different directory by passing the `store_dir` argument\n", + "fp_stats_stored = ob.FunctionPod(\n", + " compute_stats, output_keys=[\"stats\"], data_store=data_store\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now your `FunctionPod` is equipped with the ability to store and retrieve stored packets!"
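+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Under the hood, a storage-backed pod can avoid recomputation by memoizing results against the identity of the function and the content of the input packet; the directory layout visible in the output below (function name, a function-level hash, and a per-packet content hash) hints at this. The cell below is only a rough sketch of such a lookup, written with made-up helper names (`hash_function`, `hash_packet`); it is not the actual `DirDataStore` implementation." + ] + },
+ { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Rough sketch of memoization keyed on function identity and input packet content.\n", + "# hash_function and hash_packet are hypothetical helpers, not real Orcapod APIs.\n", + "from pathlib import Path\n", + "\n", + "\n", + "def memoized_call(store_dir, func, packet, hash_function, hash_packet):\n", + "    entry = Path(store_dir) / func.__name__ / hash_function(func) / hash_packet(packet)\n", + "    if entry.exists():\n", + "        return entry  # cached result: the function is not invoked again\n", + "    output = func(**packet)  # compute once ...\n", + "    entry.mkdir(parents=True, exist_ok=True)\n", + "    # ... and a real data store would now move or copy `output` under `entry`\n", + "    return output"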
+ ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tag: {'file_name': 'session_day1'}, Packet: {'stats': 'pod_data/compute_stats/15da3b08791f51d9/sha256-c63c34c5fefaa0f2e9bba3edcf6c861c/statistics.json'}\n", + "Tag: {'file_name': 'session_day3'}, Packet: {'stats': 'pod_data/compute_stats/15da3b08791f51d9/sha256-9dfef842f88463f5145ab0d4c06e3938/statistics.json'}\n", + "Tag: {'file_name': 'session_day4'}, Packet: {'stats': 'pod_data/compute_stats/15da3b08791f51d9/sha256-4f3dfa71356fe8f226be66aa8dffbc55/statistics.json'}\n", + "Tag: {'file_name': 'session_day5'}, Packet: {'stats': 'pod_data/compute_stats/15da3b08791f51d9/sha256-26bffc293c82e14cde904274e0c63afd/statistics.json'}\n" + ] + } + ], + "source": [ + "for tag, packet in fp_stats_stored(mapped_dataset2):\n", + " print(f\"Tag: {tag}, Packet: {packet}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As before, the very first time you run it, all computations take place. Now watch what happens when you run it again." + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tag: {'file_name': 'session_day1'}, Packet: {'stats': 'pod_data/compute_stats/15da3b08791f51d9/sha256-c63c34c5fefaa0f2e9bba3edcf6c861c/statistics.json'}\n", + "Tag: {'file_name': 'session_day3'}, Packet: {'stats': 'pod_data/compute_stats/15da3b08791f51d9/sha256-9dfef842f88463f5145ab0d4c06e3938/statistics.json'}\n", + "Tag: {'file_name': 'session_day4'}, Packet: {'stats': 'pod_data/compute_stats/15da3b08791f51d9/sha256-4f3dfa71356fe8f226be66aa8dffbc55/statistics.json'}\n", + "Tag: {'file_name': 'session_day5'}, Packet: {'stats': 'pod_data/compute_stats/15da3b08791f51d9/sha256-26bffc293c82e14cde904274e0c63afd/statistics.json'}\n" + ] + } + ], + "source": [ + "for tag, packet in fp_stats_stored(mapped_dataset2):\n", + " print(f\"Tag: {tag}, Packet: {packet}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notice that this time, the function `compute_stats` was **not** invoked. Rather, the computation results from the previous run were *memoized* and *retrieved*, sparing us the unnecessary computation!" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "orcabridge", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/02_orcabridge_basic_usage.ipynb b/notebooks/02_orcapod_basic_usage.ipynb similarity index 100% rename from notebooks/02_orcabridge_basic_usage.ipynb rename to notebooks/02_orcapod_basic_usage.ipynb diff --git a/notebooks/03_orcabridge_qol_features.ipynb b/notebooks/03_orcacapod_qol_features.ipynb similarity index 100% rename from notebooks/03_orcabridge_qol_features.ipynb rename to notebooks/03_orcacapod_qol_features.ipynb diff --git a/notebooks/04_orcabridge_tracker.ipynb b/notebooks/04_orcapod_tracker.ipynb similarity index 100% rename from notebooks/04_orcabridge_tracker.ipynb rename to notebooks/04_orcapod_tracker.ipynb From fa8eaa1228fcb2f8efde599f76fe4f5235773b93 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Thu, 19 Jun 2025 00:37:49 +0000 Subject: [PATCH 4/4] build: update uv.lock --- uv.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/uv.lock b/uv.lock index a886dbe..589ebc2 100644 --- a/uv.lock +++ b/uv.lock @@ -1191,7 +1191,7 @@ wheels = [ ] [[package]] -name = "orcabridge" +name = "orcapod" source = { editable = "." } dependencies = [ { name = "beartype" },