Skip to content

Commit 423f81d

Browse files
authored
Port NParallelText corpus (#247)
1 parent 68cb79d commit 423f81d

10 files changed

+1187
-415
lines changed

machine/corpora/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616
from .memory_stream_container import MemoryStreamContainer
1717
from .memory_text import MemoryText
1818
from .multi_key_ref import MultiKeyRef
19+
from .n_parallel_text_corpus import NParallelTextCorpus
20+
from .n_parallel_text_row import NParallelTextRow
1921
from .parallel_text_corpus import ParallelTextCorpus
2022
from .parallel_text_row import ParallelTextRow
2123
from .paratext_backup_terms_corpus import ParatextBackupTermsCorpus
@@ -121,6 +123,8 @@
121123
"nfkc_normalize",
122124
"nfkd_normalize",
123125
"normalize",
126+
"NParallelTextCorpus",
127+
"NParallelTextRow",
124128
"ParallelTextCorpus",
125129
"ParallelTextRow",
126130
"ParatextBackupTermsCorpus",

machine/corpora/corpus.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,11 @@
55
from ..utils.context_managed_generator import ContextManagedGenerator
66
from .alignment_row import AlignmentRow
77
from .corpora_utils import batch, get_split_indices
8+
from .n_parallel_text_row import NParallelTextRow
89
from .parallel_text_row import ParallelTextRow
910
from .text_row import TextRow
1011

11-
Row = TypeVar("Row", TextRow, ParallelTextRow, AlignmentRow)
12+
Row = TypeVar("Row", TextRow, ParallelTextRow, AlignmentRow, NParallelTextRow)
1213
Item = TypeVar("Item")
1314

1415

machine/corpora/n_parallel_text_corpus.py

Lines changed: 397 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
from abc import ABC, abstractmethod
2+
from typing import Iterable, Sequence
3+
4+
from .corpus import Corpus
5+
from .n_parallel_text_row import NParallelTextRow
6+
from .text_corpus import TextCorpus
7+
8+
9+
class NParallelTextCorpusBase(Corpus[NParallelTextRow], ABC):
    """Abstract base for corpora whose rows are ``NParallelTextRow`` objects.

    Concrete subclasses must supply ``n``, ``corpora`` and ``get_rows``.
    """

    @property
    @abstractmethod
    def n(self) -> int:
        """Return ``n`` for this corpus.

        NOTE(review): presumably the number of parallel texts, i.e.
        ``len(self.corpora)`` -- confirm against the concrete implementation.
        """

    @property
    @abstractmethod
    def corpora(self) -> Sequence[TextCorpus]:
        """Return the text corpora exposed by this corpus."""

    @abstractmethod
    def get_rows(self, text_ids: Iterable[str]) -> Sequence[NParallelTextRow]:
        """Return the rows associated with ``text_ids``."""
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
from typing import Any, Sequence
2+
3+
from .text_row import TextRowFlags
4+
5+
6+
class NParallelTextRow:
    """A single row holding references, segments and flags for ``n`` texts.

    ``n`` is the length of ``n_refs``. ``n_segments`` and ``n_flags`` are
    created with one entry per text: an empty token list and
    ``TextRowFlags.SENTENCE_START`` respectively. Callers are expected to
    fill them in after construction.
    """

    def __init__(self, text_id: str, n_refs: Sequence[Sequence[Any]]):
        """Create a row for ``text_id``.

        Args:
            text_id: Identifier of the text this row belongs to.
            n_refs: One sequence of references per text. Individual entries
                may be empty (or ``None``), but at least one must be non-empty.

        Raises:
            ValueError: If every entry of ``n_refs`` is empty or ``None``.
        """
        if not any(n_ref is not None and len(n_ref) > 0 for n_ref in n_refs):
            raise ValueError(f"Refs must be provided but n_refs={n_refs}")
        self._text_id = text_id
        self._n_refs = n_refs
        self._n = len(n_refs)
        self.n_segments: Sequence[Sequence[str]] = [[] for _ in range(self._n)]
        self.n_flags: Sequence[TextRowFlags] = [TextRowFlags.SENTENCE_START for _ in range(self._n)]

    @property
    def text_id(self) -> str:
        """Identifier of the text this row belongs to."""
        return self._text_id

    @property
    def ref(self) -> Any:
        """First reference of the first text that has any references.

        The constructor accepts rows in which only some texts carry
        references, so the first non-empty reference list is used rather than
        blindly indexing ``n_refs[0]``.

        Raises:
            IndexError: If no text has a reference (only possible when the
                reference lists were emptied after construction).
        """
        for refs in self._n_refs:
            if refs is not None and len(refs) > 0:
                return refs[0]
        raise IndexError("row has no references")

    @property
    def n_refs(self) -> Sequence[Sequence[Any]]:
        """The reference sequences, one per text, as passed to the constructor."""
        return self._n_refs

    def is_sentence_start(self, i: int) -> bool:
        """Return whether ``SENTENCE_START`` is set in the flags of text ``i``."""
        return TextRowFlags.SENTENCE_START in self.n_flags[i]

    def is_in_range(self, i: int) -> bool:
        """Return whether ``IN_RANGE`` is set in the flags of text ``i``."""
        return TextRowFlags.IN_RANGE in self.n_flags[i]

    def is_range_start(self, i: int) -> bool:
        """Return whether ``RANGE_START`` is set in the flags of text ``i``."""
        return TextRowFlags.RANGE_START in self.n_flags[i]

    @property
    def is_empty(self) -> bool:
        """Whether every text's segment is empty.

        The previous implementation was inverted: it reported ``True`` only
        when *no* segment was empty.
        """
        return all(len(segment) == 0 for segment in self.n_segments)

    def text(self, i: int) -> str:
        """Return the tokens of text ``i`` joined by single spaces."""
        return " ".join(self.n_segments[i])

    def invert(self) -> "NParallelTextRow":
        """Return a new row with the order of the texts reversed.

        References, flags and segments are all reversed so that index ``i`` of
        the result corresponds to index ``n - 1 - i`` of this row. The original
        row is left unmodified.
        """
        inverted_row = NParallelTextRow(self._text_id, list(reversed(self._n_refs)))
        inverted_row.n_flags = list(reversed(self.n_flags))
        # Carry the segments over as well; otherwise the inverted row would
        # come back with empty segments for every text.
        inverted_row.n_segments = list(reversed(self.n_segments))
        return inverted_row

0 commit comments

Comments
 (0)