Skip to content

Commit 68cb79d

Browse files
authored
1 parent 0c3cd9c commit 68cb79d

28 files changed

+826
-342
lines changed

machine/corpora/__init__.py

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,10 @@
77
from .dbl_bundle_text_corpus import DblBundleTextCorpus
88
from .dictionary_alignment_corpus import DictionaryAlignmentCorpus
99
from .dictionary_text_corpus import DictionaryTextCorpus
10+
from .file_paratext_project_file_handler import FileParatextProjectFileHandler
1011
from .file_paratext_project_settings_parser import FileParatextProjectSettingsParser
1112
from .file_paratext_project_text_updater import FileParatextProjectTextUpdater
13+
from .file_paratext_project_versification_error_detector import FileParatextProjectVersificationErrorDetector
1214
from .flatten import flatten
1315
from .memory_alignment_collection import MemoryAlignmentCollection
1416
from .memory_stream_container import MemoryStreamContainer
@@ -18,10 +20,12 @@
1820
from .parallel_text_row import ParallelTextRow
1921
from .paratext_backup_terms_corpus import ParatextBackupTermsCorpus
2022
from .paratext_backup_text_corpus import ParatextBackupTextCorpus
23+
from .paratext_project_file_handler import ParatextProjectFileHandler
2124
from .paratext_project_settings import ParatextProjectSettings
2225
from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
2326
from .paratext_project_terms_parser_base import ParatextProjectTermsParserBase
2427
from .paratext_project_text_updater_base import ParatextProjectTextUpdaterBase
28+
from .paratext_project_versification_error_detector import ParatextProjectVersificationErrorDetector
2529
from .paratext_text_corpus import ParatextTextCorpus
2630
from .place_markers_usfm_update_block_handler import PlaceMarkersAlignmentInfo, PlaceMarkersUsfmUpdateBlockHandler
2731
from .scripture_element import ScriptureElement
@@ -70,16 +74,22 @@
7074
from .usfm_update_block import UsfmUpdateBlock
7175
from .usfm_update_block_element import UsfmUpdateBlockElement, UsfmUpdateBlockElementType
7276
from .usfm_update_block_handler import UsfmUpdateBlockHandler
77+
from .usfm_versification_error_detector import (
78+
UsfmVersificationError,
79+
UsfmVersificationErrorDetector,
80+
UsfmVersificationErrorType,
81+
)
7382
from .usx_file_alignment_collection import UsxFileAlignmentCollection
7483
from .usx_file_alignment_corpus import UsxFileAlignmentCorpus
7584
from .usx_file_text import UsxFileText
7685
from .usx_file_text_corpus import UsxFileTextCorpus
7786
from .usx_memory_text import UsxMemoryText
7887
from .usx_zip_text import UsxZipText
88+
from .zip_paratext_project_file_handler import ZipParatextProjectFileHandler
7989
from .zip_paratext_project_settings_parser import ZipParatextProjectSettingsParser
80-
from .zip_paratext_project_settings_parser_base import ZipParatextProjectSettingsParserBase
8190
from .zip_paratext_project_terms_parser import ZipParatextProjectTermsParser
8291
from .zip_paratext_project_text_updater import ZipParatextProjectTextUpdater
92+
from .zip_paratext_project_versification_detector import ZipParatextProjectVersificationErrorDetector
8393

8494
__all__ = [
8595
"AlignedWordPair",
@@ -95,8 +105,10 @@
95105
"EMPTY_SCRIPTURE_REF",
96106
"escape_spaces",
97107
"extract_scripture_corpus",
108+
"FileParatextProjectFileHandler",
98109
"FileParatextProjectSettingsParser",
99110
"FileParatextProjectTextUpdater",
111+
"FileParatextProjectVersificationErrorDetector",
100112
"flatten",
101113
"is_scripture",
102114
"lowercase",
@@ -113,14 +125,16 @@
113125
"ParallelTextRow",
114126
"ParatextBackupTermsCorpus",
115127
"ParatextBackupTextCorpus",
128+
"ParatextProjectFileHandler",
116129
"ParatextProjectSettings",
117130
"ParatextProjectSettingsParserBase",
118131
"ParatextProjectTermsParserBase",
119132
"ParatextProjectTextUpdaterBase",
133+
"ParatextProjectVersificationErrorDetector",
120134
"ParatextTextCorpus",
135+
"parse_usfm",
121136
"PlaceMarkersAlignmentInfo",
122137
"PlaceMarkersUsfmUpdateBlockHandler",
123-
"parse_usfm",
124138
"RtlReferenceOrder",
125139
"ScriptureElement",
126140
"ScriptureRef",
@@ -139,8 +153,8 @@
139153
"unescape_spaces",
140154
"UpdateUsfmMarkerBehavior",
141155
"UpdateUsfmParserHandler",
142-
"UpdateUsfmTextBehavior",
143156
"UpdateUsfmRow",
157+
"UpdateUsfmTextBehavior",
144158
"UsfmAttribute",
145159
"UsfmElementType",
146160
"UsfmFileText",
@@ -164,14 +178,18 @@
164178
"UsfmUpdateBlockElement",
165179
"UsfmUpdateBlockElementType",
166180
"UsfmUpdateBlockHandler",
181+
"UsfmVersificationError",
182+
"UsfmVersificationErrorDetector",
183+
"UsfmVersificationErrorType",
167184
"UsxFileAlignmentCollection",
168185
"UsxFileAlignmentCorpus",
169186
"UsxFileText",
170187
"UsxFileTextCorpus",
171188
"UsxMemoryText",
172189
"UsxZipText",
190+
"ZipParatextProjectFileHandler",
173191
"ZipParatextProjectSettingsParser",
174-
"ZipParatextProjectSettingsParserBase",
175192
"ZipParatextProjectTermsParser",
176193
"ZipParatextProjectTextUpdater",
194+
"ZipParatextProjectVersificationErrorDetector",
177195
]

machine/corpora/file_paratext_project_file_handler.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
from pathlib import Path
2+
from typing import BinaryIO, Optional
3+
4+
from ..utils.typeshed import StrPath
5+
from .paratext_project_file_handler import ParatextProjectFileHandler
6+
from .usfm_stylesheet import UsfmStylesheet
7+
8+
9+
class FileParatextProjectFileHandler(ParatextProjectFileHandler):
    """Project file handler backed by a Paratext project directory on disk.

    File names given to :meth:`exists` and :meth:`open` are joined onto the
    project directory before touching the file system.
    """

    def __init__(self, project_dir: StrPath) -> None:
        """Store the project directory as an absolute path.

        Args:
            project_dir: Directory that contains the project files.
        """
        # find() returns paths produced by globbing the project directory, and
        # callers hand those straight back to open(), which joins them onto the
        # project directory again. Joining onto an absolute path discards the
        # left operand, so anchoring the directory here keeps that round trip
        # from resolving to "<dir>/<dir>/<file>" when a relative directory is
        # supplied.
        self._project_dir = Path(project_dir).absolute()

    def exists(self, file_name: str) -> bool:
        """Return True if ``file_name`` is a regular file in the project directory."""
        return (self._project_dir / file_name).is_file()

    def open(self, file_name: str) -> BinaryIO:
        """Open ``file_name`` from the project directory for binary reading.

        The caller owns the returned stream and is responsible for closing it.
        """
        return open(self._project_dir / file_name, "rb")

    def find(self, extension: str) -> Optional[Path]:
        """Return the first path matching ``*{extension}`` in the project directory.

        Returns:
            The matching path (including the project directory), or ``None``
            when no file matches.
        """
        return next(self._project_dir.glob(f"*{extension}"), None)

    def create_stylesheet(self, file_name: str) -> UsfmStylesheet:
        """Create a ``UsfmStylesheet`` for ``file_name``.

        The project's ``custom.sty`` is passed as the second argument when that
        file exists in the project directory; otherwise ``None`` is passed.
        """
        custom_stylesheet_filename = self._project_dir / "custom.sty"
        return UsfmStylesheet(
            file_name,
            custom_stylesheet_filename if custom_stylesheet_filename.is_file() else None,
        )

machine/corpora/file_paratext_project_settings_parser.py

Lines changed: 2 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,8 @@
1-
from pathlib import Path
2-
from typing import BinaryIO, Optional
3-
41
from ..utils.typeshed import StrPath
2+
from .file_paratext_project_file_handler import FileParatextProjectFileHandler
53
from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
6-
from .usfm_stylesheet import UsfmStylesheet
74

85

96
class FileParatextProjectSettingsParser(ParatextProjectSettingsParserBase):
107
def __init__(self, project_dir: StrPath) -> None:
11-
self._project_dir = Path(project_dir)
12-
13-
def _create_stylesheet(self, file_name: StrPath) -> UsfmStylesheet:
14-
custom_stylesheet_filename = self._project_dir / "custom.sty"
15-
return UsfmStylesheet(
16-
file_name,
17-
custom_stylesheet_filename if custom_stylesheet_filename.is_file() else None,
18-
)
19-
20-
def _exists(self, file_name: StrPath) -> bool:
21-
return (self._project_dir / file_name).is_file()
22-
23-
def _find(self, extension: str) -> Optional[Path]:
24-
return next(self._project_dir.glob(f"*{extension}"), None)
25-
26-
def _open(self, file_name: StrPath) -> BinaryIO:
27-
return open(self._project_dir / file_name, "rb")
8+
super().__init__(FileParatextProjectFileHandler(project_dir))

machine/corpora/file_paratext_project_text_updater.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,16 @@
22
from typing import BinaryIO
33

44
from ..utils.typeshed import StrPath
5+
from .file_paratext_project_file_handler import FileParatextProjectFileHandler
56
from .file_paratext_project_settings_parser import FileParatextProjectSettingsParser
67
from .paratext_project_text_updater_base import ParatextProjectTextUpdaterBase
78

89

910
class FileParatextProjectTextUpdater(ParatextProjectTextUpdaterBase):
1011
def __init__(self, project_dir: StrPath) -> None:
11-
super().__init__(FileParatextProjectSettingsParser(project_dir))
12+
super().__init__(
13+
FileParatextProjectFileHandler(project_dir), FileParatextProjectSettingsParser(project_dir).parse()
14+
)
1215

1316
self._project_dir = project_dir
1417


machine/corpora/file_paratext_project_versification_error_detector.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
from ..utils.typeshed import StrPath
2+
from .file_paratext_project_file_handler import FileParatextProjectFileHandler
3+
from .file_paratext_project_settings_parser import FileParatextProjectSettingsParser
4+
from .paratext_project_versification_error_detector import ParatextProjectVersificationErrorDetector
5+
6+
7+
class FileParatextProjectVersificationErrorDetector(ParatextProjectVersificationErrorDetector):
    """Versification error detector for a Paratext project stored in a directory."""

    def __init__(self, project_dir: StrPath) -> None:
        """Wire the detector to a directory-backed file handler and parsed settings.

        Args:
            project_dir: Directory that contains the Paratext project.
        """
        # Build the handler first, then parse the settings, mirroring the
        # left-to-right argument evaluation of a single inline call.
        file_handler = FileParatextProjectFileHandler(project_dir)
        project_settings = FileParatextProjectSettingsParser(project_dir).parse()
        super().__init__(file_handler, project_settings)

machine/corpora/paratext_project_file_handler.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
from abc import ABC, abstractmethod
from typing import BinaryIO, Optional

from .usfm_stylesheet import UsfmStylesheet
5+
6+
7+
class ParatextProjectFileHandler(ABC):
    """Abstract access to the files that make up a Paratext project.

    Concrete handlers decide where the project lives; the parsers and updaters
    that receive a handler only go through these four operations.
    """

    @abstractmethod
    def exists(self, file_name: str) -> bool:
        """Return whether the project contains a file named ``file_name``."""
        ...

    @abstractmethod
    def open(self, file_name: str) -> BinaryIO:
        """Open the project file ``file_name`` and return a binary stream."""
        ...

    @abstractmethod
    def find(self, extension: str) -> Optional[str]:
        """Locate a project file by its extension (for example ``".ssf"``).

        Returns:
            An identifier for a matching file that can be passed to
            :meth:`open`, or ``None`` when the project has no such file. The
            settings parser treats a falsy result as a missing settings file.
        """
        ...

    @abstractmethod
    def create_stylesheet(self, file_name: str) -> UsfmStylesheet:
        """Create the ``UsfmStylesheet`` for the stylesheet file ``file_name``."""
        ...

machine/corpora/paratext_project_settings_parser_base.py

Lines changed: 14 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,35 +1,24 @@
1-
from abc import ABC, abstractmethod
2-
from typing import BinaryIO
1+
from abc import ABC
32
from xml.etree import ElementTree
43

54
from ..scripture.verse_ref import Versification
65
from ..utils.string_utils import parse_integer
76
from .corpora_utils import get_encoding
7+
from .paratext_project_file_handler import ParatextProjectFileHandler
88
from .paratext_project_settings import ParatextProjectSettings
9-
from .usfm_stylesheet import UsfmStylesheet
109

1110

1211
class ParatextProjectSettingsParserBase(ABC):
13-
14-
@abstractmethod
15-
def _exists(self, file_name: str) -> bool: ...
16-
17-
@abstractmethod
18-
def _find(self, extension: str) -> str: ...
19-
20-
@abstractmethod
21-
def _open(self, file_name: str) -> BinaryIO: ...
22-
23-
@abstractmethod
24-
def _create_stylesheet(self, file_name: str) -> UsfmStylesheet: ...
12+
def __init__(self, paratext_project_file_handler: ParatextProjectFileHandler):
13+
self._paratext_project_file_handler = paratext_project_file_handler
2514

2615
def parse(self) -> ParatextProjectSettings:
2716
settings_file_name = "Settings.xml"
28-
if not self._exists(settings_file_name):
29-
settings_file_name = self._find(".ssf")
17+
if not self._paratext_project_file_handler.exists(settings_file_name):
18+
settings_file_name = self._paratext_project_file_handler.find(".ssf")
3019
if not settings_file_name:
3120
raise ValueError("The project does not contain a settings file.")
32-
with self._open(settings_file_name) as stream:
21+
with self._paratext_project_file_handler.open(settings_file_name) as stream:
3322
settings_tree = ElementTree.parse(stream)
3423

3524
name = settings_tree.getroot().findtext("Name", "")
@@ -46,18 +35,21 @@ def parse(self) -> ParatextProjectSettings:
4635

4736
versification_type = int(settings_tree.getroot().findtext("Versification", "4"))
4837
versification = Versification.get_builtin(versification_type)
49-
if self._exists("custom.vrs"):
38+
if self._paratext_project_file_handler.exists("custom.vrs"):
5039
guid = settings_tree.getroot().findtext("Guid", "")
5140
versification_name = f"{versification.name}-{guid}"
5241
versification = Versification.load(
53-
self._open("custom.vrs"),
42+
self._paratext_project_file_handler.open("custom.vrs"),
5443
versification,
5544
versification_name,
5645
)
5746
stylesheet_file_name = settings_tree.getroot().findtext("StyleSheet", "usfm.sty")
58-
if not self._exists(stylesheet_file_name) and stylesheet_file_name != "usfm_sb.sty":
47+
if (
48+
not self._paratext_project_file_handler.exists(stylesheet_file_name)
49+
and stylesheet_file_name != "usfm_sb.sty"
50+
):
5951
stylesheet_file_name = "usfm.sty"
60-
stylesheet = self._create_stylesheet(stylesheet_file_name)
52+
stylesheet = self._paratext_project_file_handler.create_stylesheet(stylesheet_file_name)
6153

6254
prefix = ""
6355
form = "41MAT"

machine/corpora/paratext_project_terms_parser_base.py

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
11
from __future__ import annotations
22

33
import re
4-
from abc import ABC, abstractmethod
4+
from abc import ABC
55
from collections import defaultdict
66
from importlib.resources import open_binary
7-
from typing import BinaryIO, Dict, List, Optional, Sequence, Tuple, Union
7+
from typing import Dict, List, Optional, Sequence, Tuple, Union
88
from xml.etree import ElementTree
99

10+
from .paratext_project_file_handler import ParatextProjectFileHandler
1011
from .paratext_project_settings import ParatextProjectSettings
1112
from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
1213

@@ -24,7 +25,12 @@
2425

2526

2627
class ParatextProjectTermsParserBase(ABC):
27-
def __init__(self, settings: Union[ParatextProjectSettings, ParatextProjectSettingsParserBase]) -> None:
28+
def __init__(
29+
self,
30+
paratext_project_file_handler: ParatextProjectFileHandler,
31+
settings: Union[ParatextProjectSettings, ParatextProjectSettingsParserBase],
32+
) -> None:
33+
self._paratext_project_file_handler = paratext_project_file_handler
2834
self._settings: ParatextProjectSettings
2935
if isinstance(settings, ParatextProjectSettingsParserBase):
3036
self._settings = settings.parse()
@@ -34,8 +40,8 @@ def __init__(self, settings: Union[ParatextProjectSettings, ParatextProjectSetti
3440
def parse(self, term_categories: Sequence[str], use_term_glosses: bool = True) -> List[Tuple[str, List[str]]]:
3541
biblical_terms_doc = None
3642
if self._settings.biblical_terms_list_type == "Project":
37-
if self._exists(self._settings.biblical_terms_file_name):
38-
with self._open(self._settings.biblical_terms_file_name) as stream:
43+
if self._paratext_project_file_handler.exists(self._settings.biblical_terms_file_name):
44+
with self._paratext_project_file_handler.open(self._settings.biblical_terms_file_name) as stream:
3945
biblical_terms_doc = ElementTree.parse(stream)
4046
term_id_to_category_dict = _get_category_per_id(biblical_terms_doc)
4147
elif self._settings.biblical_terms_list_type in _PREDEFINED_TERMS_LIST_TYPES:
@@ -60,8 +66,8 @@ def parse(self, term_categories: Sequence[str], use_term_glosses: bool = True) -
6066
terms_glosses_doc = ElementTree.parse(stream)
6167

6268
term_renderings_doc: Optional[ElementTree.ElementTree[ElementTree.Element]] = None
63-
if self._exists("TermRenderings.xml"):
64-
with self._open("TermRenderings.xml") as stream:
69+
if self._paratext_project_file_handler.exists("TermRenderings.xml"):
70+
with self._paratext_project_file_handler.open("TermRenderings.xml") as stream:
6571
term_renderings_doc = ElementTree.parse(stream)
6672

6773
terms_renderings: Dict[str, List[str]] = defaultdict(list)
@@ -94,12 +100,6 @@ def parse(self, term_categories: Sequence[str], use_term_glosses: bool = True) -
94100

95101
return []
96102

97-
@abstractmethod
98-
def _exists(self, file_name: str) -> bool: ...
99-
100-
@abstractmethod
101-
def _open(self, file_name: str) -> BinaryIO: ...
102-
103103

104104
def _is_in_category(id: str, term_categories: Sequence[str], term_id_to_category_dict: Dict[str, str]) -> bool:
105105
category = term_id_to_category_dict.get(id)

machine/corpora/paratext_project_text_updater_base.py

Lines changed: 11 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
1-
from abc import ABC, abstractmethod
2-
from typing import BinaryIO, Callable, Iterable, Optional, Sequence, Union
1+
from abc import ABC
2+
from typing import Callable, Iterable, Optional, Sequence, Union
33

4-
from ..utils.typeshed import StrPath
4+
from .paratext_project_file_handler import ParatextProjectFileHandler
55
from .paratext_project_settings import ParatextProjectSettings
66
from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
77
from .update_usfm_parser_handler import (
@@ -15,7 +15,12 @@
1515

1616

1717
class ParatextProjectTextUpdaterBase(ABC):
18-
def __init__(self, settings: Union[ParatextProjectSettings, ParatextProjectSettingsParserBase]) -> None:
18+
def __init__(
19+
self,
20+
paratext_project_file_handler: ParatextProjectFileHandler,
21+
settings: Union[ParatextProjectSettings, ParatextProjectSettingsParserBase],
22+
) -> None:
23+
self._paratext_project_file_handler = paratext_project_file_handler
1924
if isinstance(settings, ParatextProjectSettingsParserBase):
2025
self._settings = settings.parse()
2126
else:
@@ -37,9 +42,9 @@ def update_usfm(
3742
compare_segments: bool = False,
3843
) -> Optional[str]:
3944
file_name: str = self._settings.get_book_file_name(book_id)
40-
if not self._exists(file_name):
45+
if not self._paratext_project_file_handler.exists(file_name):
4146
return None
42-
with self._open(file_name) as sfm_file:
47+
with self._paratext_project_file_handler.open(file_name) as sfm_file:
4348
usfm: str = sfm_file.read().decode(self._settings.encoding)
4449
handler = UpdateUsfmParserHandler(
4550
rows,
@@ -64,9 +69,3 @@ def update_usfm(
6469
f". Error: '{e}'"
6570
)
6671
raise RuntimeError(error_message) from e
67-
68-
@abstractmethod
69-
def _exists(self, file_name: StrPath) -> bool: ...
70-
71-
@abstractmethod
72-
def _open(self, file_name: StrPath) -> BinaryIO: ...

0 commit comments

Comments
 (0)