Skip to content

Commit cec61de

Browse files
authored
Add Paratext/USFM processing tutorial (#130)
- Replace the "strip_all_text" and "prefer_existing_text" parameters with a single enum parameter ("behavior", of type UpdateUsfmBehavior)
1 parent b7c06c8 commit cec61de

File tree

7 files changed: 462 additions and 22 deletions

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,3 +17,4 @@ If you would like to find out more about how to use Machine, check out the tutor
1717
- [Tokenization](https://githubtocolab.com/sillsdev/machine.py/blob/main/samples/tokenization.ipynb)
1818
- [Text Corpora](https://githubtocolab.com/sillsdev/machine.py/blob/main/samples/corpora.ipynb)
1919
- [Word Alignment](https://githubtocolab.com/sillsdev/machine.py/blob/main/samples/word_alignment.ipynb)
20+
- [Paratext/USFM Processing](https://githubtocolab.com/sillsdev/machine.py/blob/main/samples/paratext_usfm.ipynb)

machine/corpora/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@
5050
normalize,
5151
unescape_spaces,
5252
)
53-
from .update_usfm_parser_handler import UpdateUsfmParserHandler
53+
from .update_usfm_parser_handler import UpdateUsfmBehavior, UpdateUsfmParserHandler
5454
from .usfm_file_text import UsfmFileText
5555
from .usfm_file_text_corpus import UsfmFileTextCorpus
5656
from .usfm_memory_text import UsfmMemoryText
@@ -125,6 +125,7 @@
125125
"TextRow",
126126
"TextRowFlags",
127127
"unescape_spaces",
128+
"UpdateUsfmBehavior",
128129
"UpdateUsfmParserHandler",
129130
"UsfmAttribute",
130131
"UsfmElementType",

machine/corpora/paratext_project_text_updater_base.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from .paratext_project_settings import ParatextProjectSettings
66
from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
77
from .scripture_ref import ScriptureRef
8-
from .update_usfm_parser_handler import UpdateUsfmParserHandler
8+
from .update_usfm_parser_handler import UpdateUsfmBehavior, UpdateUsfmParserHandler
99
from .usfm_parser import parse_usfm
1010

1111

@@ -21,17 +21,14 @@ def update_usfm(
2121
book_id: str,
2222
rows: Optional[Sequence[Tuple[Sequence[ScriptureRef], str]]] = None,
2323
full_name: Optional[str] = None,
24-
strip_all_text: bool = False,
25-
prefer_existing_text: bool = True,
24+
behavior: UpdateUsfmBehavior = UpdateUsfmBehavior.PREFER_EXISTING,
2625
) -> Optional[str]:
2726
file_name: str = self._settings.get_book_file_name(book_id)
2827
if not self._exists(file_name):
2928
return None
3029
with self._open(file_name) as sfm_file:
3130
usfm: str = sfm_file.read().decode(self._settings.encoding)
32-
handler = UpdateUsfmParserHandler(
33-
rows, None if full_name is None else f"- {full_name}", strip_all_text, prefer_existing_text
34-
)
31+
handler = UpdateUsfmParserHandler(rows, None if full_name is None else f"- {full_name}", behavior)
3532
try:
3633
parse_usfm(usfm, handler, self._settings.stylesheet, self._settings.versification)
3734
return handler.get_usfm(self._settings.stylesheet)

machine/corpora/update_usfm_parser_handler.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from enum import Enum, auto
12
from typing import List, Optional, Sequence, Tuple, Union
23

34
from .scripture_ref import ScriptureRef
@@ -8,21 +9,25 @@
89
from .usfm_tokenizer import UsfmTokenizer
910

1011

12+
class UpdateUsfmBehavior(Enum):
13+
PREFER_EXISTING = auto()
14+
PREFER_NEW = auto()
15+
STRIP_EXISTING = auto()
16+
17+
1118
class UpdateUsfmParserHandler(ScriptureRefUsfmParserHandler):
1219
def __init__(
1320
self,
1421
rows: Optional[Sequence[Tuple[Sequence[ScriptureRef], str]]] = None,
1522
id_text: Optional[str] = None,
16-
strip_all_text: bool = False,
17-
prefer_existing_text: bool = False,
23+
behavior: UpdateUsfmBehavior = UpdateUsfmBehavior.PREFER_EXISTING,
1824
) -> None:
1925
super().__init__()
2026
self._rows = rows or []
2127
self._tokens: List[UsfmToken] = []
2228
self._new_tokens: List[UsfmToken] = []
2329
self._id_text = id_text
24-
self._strip_all_text = strip_all_text
25-
self._prefer_existing_text = prefer_existing_text
30+
self._behavior = behavior
2631
self._replace_stack: List[bool] = []
2732
self._row_index: int = 0
2833
self._token_index: int = 0
@@ -283,7 +288,9 @@ def _replace_with_new_tokens(self, state: UsfmParserState) -> bool:
283288
existing_text = True
284289
break
285290
use_new_tokens: bool = (
286-
self._strip_all_text or (new_text and not existing_text) or (new_text and not self._prefer_existing_text)
291+
self._behavior is UpdateUsfmBehavior.STRIP_EXISTING
292+
or (new_text and not existing_text)
293+
or (new_text and self._behavior is UpdateUsfmBehavior.PREFER_NEW)
287294
)
288295
if use_new_tokens:
289296
self._tokens.extend(self._new_tokens)

0 commit comments

Comments
 (0)