Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# orcabridge
Prototype of Orcapod as implemented in Python with functions
# Orcapod Python
Orcapod's Python library for developing reproducbile scientific pipelines.

## Continuous Integration

Expand Down
2 changes: 1 addition & 1 deletion misc/demo_redis_mocking.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ def demonstrate_redis_mocking():
MockConnectionError,
),
):
from orcabridge.hashing.string_cachers import RedisCacher
from orcapod.hashing.string_cachers import RedisCacher

# Create a mock Redis instance
mock_redis = MockRedis()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
"source": [
"Below I define few critical concepts for Orcapod.\n",
"\n",
"* `Data` -- In Orcapod, smallest unit of `data` is a single `file`. Unlike many other computation pipeline system, Orcapod pipeline in principle does **not** operate on `data` that's not a file. In other words, `Oracpod` pipeline will **not** pass a data in memory from one node to another. Consequently, all operations and processing in Orcapod pipeline revolves around `file` (NOTE: this is a particularly strong/restrictive version of Oracpod pipeline. We may consider extending data to things like environment variable and command line arguments)\n",
"* `Data` -- In Orcapod, the smallest unit of `data` is a single `file`. Unlike many other computation pipeline system, Orcapod pipeline in principle does **not** operate on `data` that's not a file. In other words, `Oracpod` pipeline will **not** pass a data in memory from one node to another. Consequently, all operations and processing in Orcapod pipeline revolves around `file` (NOTE: this is a particularly strong/restrictive version of Oracpod pipeline. We may consider extending data to things like environment variable and command line arguments)\n",
"* `Pathset` -- a unit of data that can be passed into a pod. A `pathset` consists of a file, a directory, or a collection of one or more file and directories.\n",
"* `Packet` -- a single concrete instance of key-value pair, mapping packet key to a single `pathset`.\n",
"* `Stream` -- a series of one or more `packets` flowing from a `data producer` to a `data consumer`. In a directed acyclic graph represneing an Orcapod `pipeline`, a `stream` corresponds to a *directed* edge connecting from a data source into a `data consumer` (e.g., `pod`)\n",
Expand Down
2,856 changes: 2,856 additions & 0 deletions notebooks/02_orcapod_basic_usage copy.ipynb

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@ requires = ["setuptools>=64", "wheel", "setuptools-scm>=8"]
build-backend = "setuptools.build_meta"

[project]
name = "orcabridge"
description = "Function-based Oracapod Pipeline implementation in Python"
name = "orcapod"
description = "simple yet powerful pipeline library for building reproducible scientific pipeline"
dynamic = ["version"]
dependencies = [
"xxhash",
Expand All @@ -27,7 +27,7 @@ classifiers = [
]

[project.urls]
Homepage = "https://github.com/walkerlab/orcabridge"
Homepage = "https://github.com/walkerlab/orcapod-python"

[project.optional-dependencies]
redis = ["redis>=6.2.0"]
Expand All @@ -37,7 +37,7 @@ redis = ["redis>=6.2.0"]
where = ["src"]

[tool.setuptools_scm]
version_file = "src/orcabridge/_version.py"
version_file = "src/orcapod/_version.py"

[dependency-groups]
dev = [
Expand Down
File renamed without changes.
File renamed without changes.
6 changes: 3 additions & 3 deletions src/orcabridge/core/base.py → src/orcapod/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@
from typing import Any, TypeVar, Hashable


from orcabridge.hashing import HashableMixin
from orcabridge.types import Packet, Tag, TypeSpec
from orcabridge.utils.stream_utils import get_typespec
from orcapod.hashing import HashableMixin
from orcapod.types import Packet, Tag, TypeSpec
from orcapod.utils.stream_utils import get_typespec

import logging

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@
from typing import Any


from orcabridge.core.base import Operator, SyncStream
from orcabridge.hashing import function_content_hash, hash_function
from orcabridge.core.streams import SyncStreamFromGenerator
from orcabridge.utils.stream_utils import (
from orcapod.core.base import Operator, SyncStream
from orcapod.hashing import function_content_hash, hash_function
from orcapod.core.streams import SyncStreamFromGenerator
from orcapod.utils.stream_utils import (
batch_packet,
batch_tags,
check_packet_compatibility,
Expand All @@ -16,7 +16,7 @@
merge_typespecs,
)

from orcabridge.types import Packet, Tag, TypeSpec
from orcapod.types import Packet, Tag, TypeSpec


class Repeat(Operator):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@
from pathlib import Path
from typing import Any, Literal

from orcabridge.core.base import Source
from orcabridge.hashing import hash_function
from orcabridge.core.streams import SyncStream, SyncStreamFromGenerator
from orcabridge.types import Packet, Tag
from orcapod.core.base import Source
from orcapod.hashing import hash_function
from orcapod.core.streams import SyncStream, SyncStreamFromGenerator
from orcapod.types import Packet, Tag


class GlobSource(Source):
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from collections.abc import Callable, Collection, Iterator

from orcabridge.core.base import SyncStream
from orcabridge.types import Packet, Tag
from orcapod.core.base import SyncStream
from orcapod.types import Packet, Tag


class SyncStreamFromLists(SyncStream):
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from orcabridge.core.base import Invocation, Kernel, Tracker
from orcapod.core.base import Invocation, Kernel, Tracker


class GraphTracker(Tracker):
Expand Down
File renamed without changes.
2 changes: 1 addition & 1 deletion src/orcabridge/dj/mapper.py → src/orcapod/dj/mapper.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import warnings
from typing import Optional

from orcabridge.mappers import Join, MapPackets, Mapper, MapTags
from orcapod.mappers import Join, MapPackets, Mapper, MapTags
from .operation import QueryOperation
from .stream import QueryStream

Expand Down
File renamed without changes.
File renamed without changes.
6 changes: 3 additions & 3 deletions src/orcabridge/dj/source.py → src/orcapod/dj/source.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@
import datajoint as dj
from datajoint import Schema, Table

from orcabridge.hashing import hash_to_uuid
from orcapod.hashing import hash_to_uuid

from orcabridge.sources import Source
from orcabridge.streams import SyncStream
from orcapod.sources import Source
from orcapod.streams import SyncStream
from ..utils.name import pascal_to_snake, snake_to_pascal
from ..utils.stream_utils import common_elements
from .operation import QueryOperation
Expand Down
2 changes: 1 addition & 1 deletion src/orcabridge/dj/stream.py → src/orcapod/dj/stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from datajoint.expression import QueryExpression
from datajoint.table import Table

from orcabridge.streams import SyncStream
from orcapod.streams import SyncStream

logger = logging.getLogger(__name__)

Expand Down
8 changes: 4 additions & 4 deletions src/orcabridge/dj/tracker.py → src/orcapod/dj/tracker.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@
import networkx as nx
from datajoint import Schema

from orcabridge.base import Operation, Source
from orcabridge.mappers import Mapper, Merge
from orcabridge.pod import FunctionPod
from orcabridge.pipeline import GraphTracker
from orcapod.base import Operation, Source
from orcapod.mappers import Mapper, Merge
from orcapod.pod import FunctionPod
from orcapod.pipeline import GraphTracker

from .mapper import convert_to_query_mapper
from .operation import QueryOperation
Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@

import xxhash

from orcabridge.types import Packet, PathSet
from orcabridge.utils.name import find_noncolliding_name
from orcapod.types import Packet, PathSet
from orcapod.utils.name import find_noncolliding_name

# Configure logging with __name__ for proper hierarchy
logger = logging.getLogger(__name__)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
# A collection of utility function that provides a "default" implementation of hashers.
# This is often used as the fallback hasher in the library code.
from orcabridge.hashing.types import CompositeFileHasher, ArrowHasher
from orcabridge.hashing.file_hashers import PathLikeHasherFactory
from orcabridge.hashing.string_cachers import InMemoryCacher
from orcabridge.hashing.object_hashers import ObjectHasher
from orcabridge.hashing.object_hashers import LegacyObjectHasher
from orcabridge.hashing.function_info_extractors import FunctionInfoExtractorFactory
from orcabridge.hashing.semantic_arrow_hasher import SemanticArrowHasher, PathHasher
from orcapod.hashing.types import CompositeFileHasher, ArrowHasher
from orcapod.hashing.file_hashers import PathLikeHasherFactory
from orcapod.hashing.string_cachers import InMemoryCacher
from orcapod.hashing.object_hashers import ObjectHasher
from orcapod.hashing.object_hashers import LegacyObjectHasher
from orcapod.hashing.function_info_extractors import FunctionInfoExtractorFactory
from orcapod.hashing.semantic_arrow_hasher import SemanticArrowHasher, PathHasher


def get_default_composite_file_hasher(with_cache=True) -> CompositeFileHasher:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
from orcabridge.hashing.core import hash_file, hash_pathset, hash_packet
from orcabridge.hashing.types import (
from orcapod.hashing.core import hash_file, hash_pathset, hash_packet
from orcapod.hashing.types import (
FileHasher,
PathSetHasher,
StringCacher,
CompositeFileHasher,
)
from orcabridge.types import Packet, PathLike, PathSet
from orcapod.types import Packet, PathLike, PathSet


# Completely unnecessary to inherit from FileHasher, but this
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from .types import FunctionInfoExtractor
from collections.abc import Callable
from typing import Any, Literal
from orcabridge.types import TypeSpec
from orcapod.types import TypeSpec
import inspect


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from pathlib import Path
from typing import TYPE_CHECKING, Any, TYPE_CHECKING

from orcabridge.hashing.types import StringCacher
from orcapod.hashing.types import StringCacher

logger = logging.getLogger(__name__)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from typing import Any, Protocol, runtime_checkable
import uuid

from orcabridge.types import Packet, PathLike, PathSet, TypeSpec
from orcapod.types import Packet, PathLike, PathSet, TypeSpec

import pyarrow as pa

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@
import networkx as nx
import pandas as pd

from orcabridge.core.base import Invocation, Kernel
from orcabridge.hashing import hash_to_hex
from orcabridge.core.tracker import GraphTracker
from orcapod.core.base import Invocation, Kernel
from orcapod.hashing import hash_to_hex
from orcapod.core.tracker import GraphTracker

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -713,7 +713,7 @@ def validate_pipeline_serializability(pipeline: Pipeline) -> None:

def create_example_pipeline() -> Pipeline:
"""Create an example pipeline for testing"""
from orcabridge import GlobSource, function_pod
from orcapod import GlobSource, function_pod

@function_pod
def example_function(input_file):
Expand Down
File renamed without changes.
20 changes: 10 additions & 10 deletions src/orcabridge/pod/core.py → src/orcapod/pod/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,10 @@
Literal,
)

from orcabridge.types.registry import PacketConverter
from orcapod.types.registry import PacketConverter

from orcabridge.core.base import Kernel
from orcabridge.hashing import (
from orcapod.core.base import Kernel
from orcapod.hashing import (
ObjectHasher,
ArrowHasher,
FunctionInfoExtractor,
Expand All @@ -23,18 +23,18 @@
get_default_object_hasher,
get_default_arrow_hasher,
)
from orcabridge.core.operators import Join
from orcabridge.store import DataStore, ArrowDataStore, NoOpDataStore
from orcabridge.core.streams import SyncStream, SyncStreamFromGenerator
from orcabridge.types import Packet, PathSet, PodFunction, Tag, TypeSpec
from orcapod.core.operators import Join
from orcapod.store import DataStore, ArrowDataStore, NoOpDataStore
from orcapod.core.streams import SyncStream, SyncStreamFromGenerator
from orcapod.types import Packet, PathSet, PodFunction, Tag, TypeSpec

from orcabridge.types.default import default_registry
from orcabridge.types.inference import (
from orcapod.types.default import default_registry
from orcapod.types.inference import (
extract_function_data_types,
verify_against_typespec,
check_typespec_compatibility,
)
from orcabridge.types.registry import is_packet_supported
from orcapod.types.registry import is_packet_supported
import polars as pl

logger = logging.getLogger(__name__)
Expand Down
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from dataclasses import dataclass
from datetime import datetime, timedelta
import logging
from orcabridge.store.types import DuplicateError
from orcapod.store.types import DuplicateError

# Module-level logger
logger = logging.getLogger(__name__)
Expand Down
10 changes: 5 additions & 5 deletions src/orcabridge/store/core.py → src/orcapod/store/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@
from os import PathLike
from pathlib import Path

from orcabridge.hashing import hash_packet
from orcabridge.hashing.defaults import get_default_composite_file_hasher
from orcabridge.hashing.types import PacketHasher
from orcabridge.store.types import DataStore
from orcabridge.types import Packet
from orcapod.hashing import hash_packet
from orcapod.hashing.defaults import get_default_composite_file_hasher
from orcapod.hashing.types import PacketHasher
from orcapod.store.types import DataStore
from orcapod.types import Packet

logger = logging.getLogger(__name__)

Expand Down
2 changes: 1 addition & 1 deletion src/orcabridge/store/file.py → src/orcapod/store/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from pathlib import Path
from typing import Callable, Collection, Dict, Optional, Tuple, Union

from orcabridge.types import Packet, PathSet
from orcapod.types import Packet, PathSet


@contextlib.contextmanager
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import os
from pathlib import Path

from orcabridge.types import PathLike
from orcapod.types import PathLike

logger = logging.getLogger(__name__)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ def __init__(

def _get_output_dir(self, function_name, content_hash, packet):
"""Get the output directory for a specific packet"""
from orcabridge.hashing.core import hash_dict
from orcapod.hashing.core import hash_dict

packet_hash = hash_dict(packet)
return self.store_dir / function_name / content_hash / str(packet_hash)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Implements transfer data store that lets you transfer memoized packets between data stores.

from orcabridge.store.types import DataStore
from orcabridge.types import Packet
from orcapod.store.types import DataStore
from orcapod.types import Packet


class TransferDataStore(DataStore):
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from typing import Protocol, runtime_checkable

from orcabridge.types import Tag, Packet
from orcapod.types import Tag, Packet
import pyarrow as pa
import polars as pl

Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from optparse import Values
from typing import Any
import pyarrow as pa
from orcabridge.types import Packet
from orcapod.types import Packet
from .core import TypeHandler, TypeInfo, TypeSpec

# This mapping is expected to be stable
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from collections.abc import Collection, Mapping
from typing import TypeVar, Hashable, Any

from orcabridge.types import Packet, Tag, TypeSpec
from orcapod.types import Packet, Tag, TypeSpec


K = TypeVar("K", bound=Hashable)
Expand Down
Loading