Commit b7c06c8

Fix Jupyter notebooks
1 parent e0e3f08 commit b7c06c8

13 files changed (+178, -118 lines changed)

README.md

Lines changed: 3 additions & 3 deletions

@@ -14,6 +14,6 @@ pip install sil-machine

 If you would like to find out more about how to use Machine, check out the tutorial Jupyter notebooks:

-- [Tokenization](https://nbviewer.org/github/sillsdev/machine.py/blob/main/samples/tokenization.ipynb)
-- [Text Corpora](https://nbviewer.org/github/sillsdev/machine.py/blob/main/samples/corpora.ipynb)
-- [Word Alignment](https://nbviewer.org/github/sillsdev/machine.py/blob/main/samples/word_alignment.ipynb)
+- [Tokenization](https://githubtocolab.com/sillsdev/machine.py/blob/main/samples/tokenization.ipynb)
+- [Text Corpora](https://githubtocolab.com/sillsdev/machine.py/blob/main/samples/corpora.ipynb)
+- [Word Alignment](https://githubtocolab.com/sillsdev/machine.py/blob/main/samples/word_alignment.ipynb)
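
(The links now go through githubtocolab.com, which opens each notebook directly in Google Colab where it can be run, rather than the static nbviewer.org rendering.)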

machine/corpora/parallel_text_corpus.py

Lines changed: 30 additions & 0 deletions

@@ -310,6 +310,11 @@ def filter(self, predicate: Callable[[ParallelTextRow], bool]) -> ParallelTextCorpus:
     def filter_by_index(self, predicate: Callable[[ParallelTextRow, int], bool]) -> ParallelTextCorpus:
         return _FilterParallelTextCorpus(self, predicate)

+    def filter_texts(self, text_ids: Optional[Iterable[str]]) -> ParallelTextCorpus:
+        if text_ids is None:
+            return self
+        return _FilterTextsParallelTextCorpus(self, text_ids)
+
     def take(self, count: int) -> ParallelTextCorpus:
         return _TakeParallelTextCorpus(self, count)

@@ -553,6 +558,31 @@ def _get_rows(self, text_ids: Optional[Iterable[str]]) -> Generator[ParallelTextRow, None, None]:
         yield from islice(rows, self._count)


+class _FilterTextsParallelTextCorpus(ParallelTextCorpus):
+    def __init__(self, corpus: ParallelTextCorpus, text_ids: Iterable[str]) -> None:
+        self._corpus = corpus
+        self._text_ids = set(text_ids)
+
+    @property
+    def is_source_tokenized(self) -> bool:
+        return self._corpus.is_source_tokenized
+
+    @property
+    def is_target_tokenized(self) -> bool:
+        return self._corpus.is_target_tokenized
+
+    def _get_rows(self, text_ids: Optional[Iterable[str]]) -> Generator[ParallelTextRow, None, None]:
+        with self._corpus.get_rows(
+            self._text_ids if text_ids is None else self._text_ids.intersection(text_ids)
+        ) as rows:
+            yield from rows
+
+    def count(self, include_empty: bool = True, text_ids: Optional[Iterable[str]] = None) -> int:
+        return self._corpus.count(
+            include_empty, self._text_ids if text_ids is None else self._text_ids.intersection(text_ids)
+        )
+
+
 class _PandasParallelTextCorpus(ParallelTextCorpus):
     def __init__(
         self,
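
For orientation, a small runnable sketch of the text-id narrowing the new code performs. The ids below are made up for the example; only the set logic mirrors _FilterTextsParallelTextCorpus._get_rows, and filter_texts(None) simply returns the original corpus unchanged.

    # Hypothetical ids, mirroring the intersection in _FilterTextsParallelTextCorpus._get_rows.
    filter_ids = {"MAT", "MRK", "LUK"}   # ids captured by filter_texts(...)
    requested = ["MRK", "JHN"]           # ids later passed to get_rows(...)

    effective = filter_ids if requested is None else filter_ids.intersection(requested)
    print(effective)  # {'MRK'}: only ids present in both collections reach the wrapped corpus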

machine/corpora/scripture_ref.py

Lines changed: 4 additions & 1 deletion

@@ -123,7 +123,10 @@ def __hash__(self) -> int:
         return hash((self.verse_ref, tuple(self.path)))

     def __repr__(self) -> str:
-        return f"{self.verse_ref}/{'/'.join(str(se) for se in self.path)}"
+        result = str(self.verse_ref)
+        if len(self.path) > 0:
+            result += "/" + "/".join(str(se) for se in self.path)
+        return result


 EMPTY_SCRIPTURE_REF = ScriptureRef()
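
Net effect: a ScriptureRef with an empty path now renders as just the verse reference, whereas the old f-string always appended a trailing "/" even when there were no path elements.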

machine/corpora/usfm_parser.py

Lines changed: 1 addition & 11 deletions

@@ -256,7 +256,7 @@ def process_token(self) -> bool:
                 and pub_verse_num_token.text is not None
                 and pub_verse_end_token.marker == "vp*"
             ):
-                pub_chapter = pub_verse_num_token.text.strip()
+                pub_verse = pub_verse_num_token.text.strip()
                 self.state.special_token_count += 3

         assert token.data is not None
@@ -425,16 +425,6 @@ def _close_all(self) -> None:
         while len(self.state.stack) > 0:
             self._close_element()

-    def _is_study_bible_item_closed(self, start_marker: str, ending_marker: str) -> bool:
-        for i in range(self.state.index + 1, len(self.state.tokens)):
-            token = self.state.tokens[i]
-            if token.marker == ending_marker:
-                return True
-
-            if token.marker == start_marker or token.type in {UsfmTokenType.BOOK, UsfmTokenType.CHAPTER}:
-                return False
-        return False
-
     def _determine_unknown_token_type(self) -> UsfmTokenType:
         if any(e.type == UsfmElementType.NOTE for e in self.state.stack):
             return UsfmTokenType.CHARACTER
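
Two unrelated cleanups here: the published verse number from a \vp ... \vp* run is now stored in pub_verse rather than overwriting pub_chapter, and the _is_study_bible_item_closed helper, which appears to have no remaining callers, is removed.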

machine/corpora/usfm_stylesheet.py

Lines changed: 7 additions & 7 deletions

@@ -4,7 +4,7 @@
 import regex as re

 from ..utils.file_utils import detect_encoding
-from ..utils.string_utils import parse_integer
+from ..utils.string_utils import parse_float, parse_integer
 from ..utils.typeshed import StrPath
 from .usfm_tag import UsfmJustification, UsfmStyleAttribute, UsfmStyleType, UsfmTag, UsfmTextProperties, UsfmTextType

@@ -248,16 +248,16 @@ def _parse_tag_entry(tag: UsfmTag, entries: List[Tuple[str, str]], entry_index:
         if space_after is not None and space_after >= 0:
             tag.space_after = space_after
     elif entry_marker == "leftmargin":
-        left_margin = parse_integer(entry_text)
-        if left_margin is not None and left_margin >= 0:
+        left_margin = parse_float(entry_text)
+        if left_margin is not None:
             tag.left_margin = left_margin
     elif entry_marker == "rightmargin":
-        right_margin = parse_integer(entry_text)
-        if right_margin is not None and right_margin >= 0:
+        right_margin = parse_float(entry_text)
+        if right_margin is not None:
             tag.right_margin = right_margin
     elif entry_marker == "firstlineindent":
-        first_line_indent = parse_integer(entry_text)
-        if first_line_indent is not None and first_line_indent >= 0:
+        first_line_indent = parse_float(entry_text)
+        if first_line_indent is not None:
             tag.first_line_indent = first_line_indent
     elif entry_marker == "rank":
         if entry_text == "-":

machine/corpora/usfm_tag.py

Lines changed: 3 additions & 3 deletions

@@ -66,18 +66,18 @@ def __init__(self, marker: str) -> None:
         self.description: Optional[str] = None
         self.encoding: Optional[str] = None
         self.end_marker: Optional[str] = None
-        self.first_line_indent: int = 0
+        self.first_line_indent: float = 0
         self.font_name: Optional[str] = None
         self.font_size: int = 0
         self.italic: bool = False
         self.justification: UsfmJustification = UsfmJustification.LEFT
-        self.left_margin: int = 0
+        self.left_margin: float = 0
         self.line_spacing: int = 0
         self.name: Optional[str] = None
         self.not_repeatable: bool = False
         self._occurs_under: Set[str] = set()
         self.rank: int = 0
-        self.right_margin: int = 0
+        self.right_margin: float = 0
         self.small_caps: bool = False
         self.space_after: int = 0
         self.space_before: int = 0
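
Together with the usfm_stylesheet.py change above, first_line_indent, left_margin, and right_margin are now parsed with parse_float and stored as float, and the old ">= 0" guards are gone; presumably this is to accept fractional stylesheet values such as .125 and negative indents, though the commit does not state the motivation.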

machine/translation/corpus_ops.py

Lines changed: 12 additions & 11 deletions

@@ -1,5 +1,6 @@
-from typing import Callable, Generator, Optional, Union
+from typing import Callable, Generator, Iterable, Optional, Union

+from ..corpora.corpora_utils import batch
 from ..corpora.parallel_text_corpus import ParallelTextCorpus
 from ..corpora.parallel_text_row import ParallelTextRow
 from ..utils.progress_status import ProgressStatus
@@ -48,11 +49,11 @@ def is_source_tokenized(self) -> bool:
     def is_target_tokenized(self) -> bool:
         return self._corpus.is_target_tokenized

-    def _get_rows(self) -> Generator[ParallelTextRow, None, None]:
-        with self._corpus.batch(self._batch_size) as batches:
-            for batch in batches:
-                alignments = self._aligner.align_batch(batch)
-                for row, alignment in zip(batch, alignments):
+    def _get_rows(self, text_ids: Optional[Iterable[str]] = None) -> Generator[ParallelTextRow, None, None]:
+        with self._corpus.get_rows(text_ids) as rows:
+            for row_batch in batch(rows, self._batch_size):
+                alignments = self._aligner.align_batch(row_batch)
+                for row, alignment in zip(row_batch, alignments):
                     known_alignment = WordAlignmentMatrix.from_parallel_text_row(row)
                     if known_alignment is not None:
                         known_alignment.priority_symmetrize_with(alignment)
@@ -78,12 +79,12 @@ def is_source_tokenized(self) -> bool:
     def is_target_tokenized(self) -> bool:
         return self._corpus.is_target_tokenized

-    def _get_rows(self) -> Generator[ParallelTextRow, None, None]:
-        with self._corpus.batch(self._batch_size) as batches:
-            for batch in batches:
+    def _get_rows(self, text_ids: Optional[Iterable[str]] = None) -> Generator[ParallelTextRow, None, None]:
+        with self._corpus.get_rows(text_ids) as rows:
+            for row_batch in batch(rows, self._batch_size):
                 translations = self._translation_engine.translate_batch(
-                    [r.source_segment if self.is_source_tokenized else r.source_text for r in batch]
+                    [r.source_segment if self.is_source_tokenized else r.source_text for r in row_batch]
                 )
-                for row, translation in zip(batch, translations):
+                for row, translation in zip(row_batch, translations):
                     row.target_segment = translation.target_tokens
                     yield row
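
The rewritten _get_rows methods depend on a batch helper imported from ..corpora.corpora_utils, whose implementation is not shown in this diff. The sketch below only illustrates the chunking behavior the new code assumes (an iterable split into lists of at most batch_size items); the real helper may differ.

    from itertools import islice
    from typing import Iterable, Iterator, List, TypeVar

    T = TypeVar("T")

    def batch(iterable: Iterable[T], batch_size: int) -> Iterator[List[T]]:
        # Assumed behavior only: yield successive chunks of at most batch_size items.
        iterator = iter(iterable)
        while True:
            chunk = list(islice(iterator, batch_size))
            if not chunk:
                return
            yield chunk

    print(list(batch(range(7), 3)))  # [[0, 1, 2], [3, 4, 5], [6]]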

machine/utils/string_utils.py

Lines changed: 7 additions & 0 deletions

@@ -72,6 +72,13 @@ def parse_integer(s: str) -> Optional[int]:
     return None


+def parse_float(s: str) -> Optional[float]:
+    try:
+        return float(s)
+    except ValueError:
+        return None
+
+
 def has_sentence_ending(s: str) -> bool:
     s = s.strip()
     for c in reversed(s):
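
A quick demonstration of the new helper's behavior; the function body is copied from the hunk above so the snippet runs on its own.

    from typing import Optional

    def parse_float(s: str) -> Optional[float]:
        try:
            return float(s)
        except ValueError:
            return None

    print(parse_float("0.125"))  # 0.125: fractional values (e.g. stylesheet margins) now parse
    print(parse_float("-1"))     # -1.0: negative values are accepted
    print(parse_float("x"))      # None: non-numeric text still yields None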
