
Commit 2fae0c6

ddaspit and johnml1135 authored
Add MT tutorial (#129)
* Add MT tutorial
* Rename to "add_unk_trg_tokens"

---------

Co-authored-by: John Lambert <john_lambert@sil.org>
1 parent cec61de commit 2fae0c6

File tree

12 files changed: +1622, -81 lines changed


.vscode/settings.json

Lines changed: 4 additions & 13 deletions
@@ -1,24 +1,15 @@
 {
     "editor.formatOnSave": true,
     "editor.codeActionsOnSave": {
-        "source.organizeImports": "explicit",
+        "source.organizeImports": "explicit"
     },
     "python.testing.unittestEnabled": false,
     "python.testing.pytestEnabled": true,
-    "python.analysis.extraPaths": [
-        "tests"
-    ],
+    "python.analysis.extraPaths": ["tests"],
     "python.analysis.importFormat": "relative",
     "[python]": {
         "editor.defaultFormatter": "ms-python.black-formatter",
         "editor.formatOnSave": true
     },
-    "black-formatter.path": [
-        "poetry",
-        "run",
-        "black"
-    ],
-    "python.analysis.extraPaths": [
-        "./tests"
-    ]
-}
+    "black-formatter.path": ["poetry", "run", "black"]
+}

README.md

Lines changed: 2 additions & 1 deletion
@@ -17,4 +17,5 @@ If you would like to find out more about how to use Machine, check out the tutor
 - [Tokenization](https://githubtocolab.com/sillsdev/machine.py/blob/main/samples/tokenization.ipynb)
 - [Text Corpora](https://githubtocolab.com/sillsdev/machine.py/blob/main/samples/corpora.ipynb)
 - [Word Alignment](https://githubtocolab.com/sillsdev/machine.py/blob/main/samples/word_alignment.ipynb)
-- [Paratext/USFM Processing](https://githubtocolab.com/sillsdev/machine.py/blob/main/samples/paratext_usfm.ipynb)
+- [Machine Translation](https://githubtocolab.com/sillsdev/machine.py/blob/main/samples/machine_translation.ipynb)
+- [Paratext/USFM Processing](https://githubtocolab.com/sillsdev/machine.py/blob/main/samples/paratext_usfm.ipynb)

machine/jobs/huggingface/hugging_face_nmt_model_factory.py

Lines changed: 13 additions & 1 deletion
@@ -3,6 +3,8 @@
 from pathlib import Path
 from typing import Any, cast

+import datasets.utils.logging as datasets_logging
+import transformers.utils.logging as transformers_logging
 from transformers import AutoConfig, AutoModelForSeq2SeqLM, HfArgumentParser, PreTrainedModel, Seq2SeqTrainingArguments
 from transformers.integrations import ClearMLCallback
 from transformers.tokenization_utils import TruncationStrategy
@@ -39,6 +41,16 @@ def __init__(self, config: Any) -> None:
         ):
             self._training_args.report_to.remove("clearml")

+        # The default of training_args.log_level is passive, so we set log level at info here to have that default.
+        transformers_logging.set_verbosity_info()
+
+        log_level = self._training_args.get_process_log_level()
+        logger.setLevel(log_level)
+        datasets_logging.set_verbosity(log_level)
+        transformers_logging.set_verbosity(log_level)
+        transformers_logging.enable_default_handler()
+        transformers_logging.enable_explicit_format()
+
     @property
     def train_tokenizer(self) -> bool:
         return False
@@ -67,7 +79,7 @@ def create_model_trainer(self, corpus: ParallelTextCorpus) -> Trainer:
             src_lang=self._config.src_lang,
             tgt_lang=self._config.trg_lang,
             add_unk_src_tokens=self._config.huggingface.tokenizer.add_unk_src_tokens,
-            add_unk_trg_tokens=self._config.huggingface.tokenizer.add_unk_trg_tokens,
+            add_unk_tgt_tokens=self._config.huggingface.tokenizer.add_unk_trg_tokens,
         )

     def create_engine(self) -> TranslationEngine:

machine/translation/huggingface/hugging_face_nmt_engine.py

Lines changed: 1 addition & 1 deletion
@@ -52,7 +52,7 @@ def __init__(
         self._tokenizer = AutoTokenizer.from_pretrained(self._model.name_or_path, use_fast=True)
         if isinstance(self._tokenizer, (NllbTokenizer, NllbTokenizerFast)):
             self._mpn = MosesPunctNormalizer()
-            self._mpn.substitutions = [(re.compile(r), sub) for r, sub in self._mpn.substitutions]
+            self._mpn.substitutions = [(re.compile(r), sub) for r, sub in self._mpn.substitutions]  # type: ignore
         else:
             self._mpn = None

machine/translation/huggingface/hugging_face_nmt_model_trainer.py

Lines changed: 17 additions & 30 deletions
@@ -6,9 +6,7 @@
 from pathlib import Path
 from typing import Any, Callable, List, Optional, Union, cast

-import datasets.utils.logging as datasets_logging
 import torch  # pyright: ignore[reportMissingImports]
-import transformers.utils.logging as transformers_logging
 from datasets.arrow_dataset import Dataset
 from sacremoses import MosesPunctNormalizer
 from torch import Tensor  # pyright: ignore[reportMissingImports]
@@ -84,10 +82,10 @@ def __init__(
         corpus: Union[ParallelTextCorpus, Dataset],
         src_lang: Optional[str] = None,
         tgt_lang: Optional[str] = None,
-        max_source_length: Optional[int] = None,
-        max_target_length: Optional[int] = None,
+        max_src_length: Optional[int] = None,
+        max_tgt_length: Optional[int] = None,
         add_unk_src_tokens: bool = False,
-        add_unk_trg_tokens: bool = True,
+        add_unk_tgt_tokens: bool = True,
     ) -> None:
         self._model = model
         self._training_args = training_args
@@ -96,12 +94,12 @@ def __init__(
         self._tgt_lang = tgt_lang
         self._trainer: Optional[Seq2SeqTrainer] = None
         self._metrics = {}
-        self.max_source_length = max_source_length
-        self.max_target_length = max_target_length
+        self.max_src_length = max_src_length
+        self.max_tgt_length = max_tgt_length
         self._add_unk_src_tokens = add_unk_src_tokens
-        self._add_unk_trg_tokens = add_unk_trg_tokens
+        self._add_unk_tgt_tokens = add_unk_tgt_tokens
         self._mpn = MosesPunctNormalizer()
-        self._mpn.substitutions = [(re.compile(r), sub) for r, sub in self._mpn.substitutions]
+        self._mpn.substitutions = [(re.compile(r), sub) for r, sub in self._mpn.substitutions]  # type: ignore
         self._stats = TrainStats()

     @property
@@ -113,17 +111,6 @@ def train(
         progress: Optional[Callable[[ProgressStatus], None]] = None,
         check_canceled: Optional[Callable[[], None]] = None,
     ) -> None:
-        if self._training_args.should_log:
-            # The default of training_args.log_level is passive, so we set log level at info here to have that default.
-            transformers_logging.set_verbosity_info()
-
-        log_level = self._training_args.get_process_log_level()
-        logger.setLevel(log_level)
-        datasets_logging.set_verbosity(log_level)
-        transformers_logging.set_verbosity(log_level)
-        transformers_logging.enable_default_handler()
-        transformers_logging.enable_explicit_format()
-
         last_checkpoint = None
         if os.path.isdir(self._training_args.output_dir) and not self._training_args.overwrite_output_dir:
             last_checkpoint = get_last_checkpoint(self._training_args.output_dir)
@@ -203,7 +190,7 @@ def add_tokens(tokenizer: Any, missing_tokens: List[str]) -> Any:
             logger.info(f"Added {len(missing_tokens)} tokens to the tokenizer: {missing_tokens}")
             return AutoTokenizer.from_pretrained(str(tokenizer_dir), use_fast=True)

-        if self._add_unk_src_tokens or self._add_unk_trg_tokens:
+        if self._add_unk_src_tokens or self._add_unk_tgt_tokens:
             logger.info("Checking for missing tokens")
             if not isinstance(tokenizer, PreTrainedTokenizerFast):
                 logger.warning(
@@ -217,7 +204,7 @@ def add_tokens(tokenizer: Any, missing_tokens: List[str]) -> Any:
                 )
                 # using unofficially supported behavior to set the normalizer
                 tokenizer.backend_tokenizer.normalizer = norm_tok.backend_tokenizer.normalizer  # type: ignore
-            if self._add_unk_src_tokens and self._add_unk_trg_tokens:
+            if self._add_unk_src_tokens and self._add_unk_tgt_tokens:
                 lang_codes = [src_lang, tgt_lang]
             elif self._add_unk_src_tokens:
                 lang_codes = [src_lang]
@@ -293,12 +280,12 @@ def add_lang_code_to_tokenizer(tokenizer: Any, lang_code: str):
         if model.name_or_path.startswith("t5-") or model.name_or_path.startswith("google/mt5-"):
             prefix = f"translate {self._src_lang} to {self._tgt_lang}: "

-        max_source_length = self.max_source_length
-        if max_source_length is None:
-            max_source_length = model.config.max_length
-        max_target_length = self.max_target_length
-        if max_target_length is None:
-            max_target_length = model.config.max_length
+        max_src_length = self.max_src_length
+        if max_src_length is None:
+            max_src_length = model.config.max_length
+        max_tgt_length = self.max_tgt_length
+        if max_tgt_length is None:
+            max_tgt_length = model.config.max_length

         if self._training_args.label_smoothing_factor > 0 and not hasattr(
             model, "prepare_decoder_input_ids_from_labels"
@@ -317,9 +304,9 @@ def preprocess_function(examples):
             inputs = [prefix + ex[src_lang] for ex in examples["translation"]]
             targets = [ex[tgt_lang] for ex in examples["translation"]]

-            model_inputs = tokenizer(inputs, max_length=max_source_length, truncation=True)
+            model_inputs = tokenizer(inputs, max_length=max_src_length, truncation=True)
             # Tokenize targets with the `text_target` keyword argument
-            labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)
+            labels = tokenizer(text_target=targets, max_length=max_tgt_length, truncation=True)

             model_inputs["labels"] = labels["input_ids"]
             return model_inputs

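For orientation, here is a minimal sketch of calling the trainer with the renamed keyword arguments (max_src_length, max_tgt_length, add_unk_tgt_tokens). It assumes the trainer accepts a pretrained model name as its first argument; the checkpoint name, language codes, output directory, and corpus are placeholders for illustration, not values taken from this commit.

# Sketch only: placeholder checkpoint, language codes, corpus, and output directory.
from transformers import Seq2SeqTrainingArguments

from machine.translation.huggingface import HuggingFaceNmtModelTrainer

training_args = Seq2SeqTrainingArguments(output_dir="out/nmt_model", num_train_epochs=1)
corpus = ...  # any ParallelTextCorpus (or a datasets.Dataset) with "translation" rows

trainer = HuggingFaceNmtModelTrainer(
    "facebook/nllb-200-distilled-600M",  # placeholder model name
    training_args,
    corpus,
    src_lang="eng_Latn",
    tgt_lang="fra_Latn",
    max_src_length=200,
    max_tgt_length=200,
    add_unk_src_tokens=False,
    add_unk_tgt_tokens=True,
)
trainer.train()
trainer.save()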
machine/translation/translation_suggester.py

Lines changed: 13 additions & 1 deletion
@@ -1,8 +1,10 @@
 from abc import ABC, abstractmethod
-from typing import Iterable, Sequence
+from typing import Iterable, Optional, Sequence

+from .interactive_translator import InteractiveTranslator
 from .translation_result import TranslationResult
 from .translation_suggestion import TranslationSuggestion
+from .truecaser import Truecaser


 class TranslationSuggester(ABC):
@@ -14,3 +16,13 @@ def __init__(self, confidence_threshold: float = 0, break_on_punctuation: bool =
     def get_suggestions(
         self, n: int, prefix_count: int, is_last_word_complete: bool, results: Iterable[TranslationResult]
     ) -> Sequence[TranslationSuggestion]: ...
+
+    def get_suggestions_from_translator(
+        self, n: int, translator: InteractiveTranslator, truecaser: Optional[Truecaser] = None
+    ) -> Sequence[TranslationSuggestion]:
+        results = translator.get_current_results()
+        if truecaser is not None:
+            results = (
+                truecaser.truecase_translation_result(result, translator.target_detokenizer) for result in results
+            )
+        return self.get_suggestions(n, len(translator.prefix_word_ranges), translator.is_last_word_complete, results)

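A short, hedged sketch of how the new convenience method might be called. Only the get_suggestions_from_translator signature comes from the diff above; the concrete suggester, the interactive translator, and the truecaser are assumptions for illustration.

# Sketch only: assumes an InteractiveTranslator has already been created for a
# source segment (e.g. via an InteractiveTranslatorFactory) and that a concrete
# suggester such as PhraseTranslationSuggester and a trained truecaser exist.
suggester = PhraseTranslationSuggester(confidence_threshold=0.2)

# Without a truecaser, the translator's current results are used as-is; with one,
# each TranslationResult is passed through truecase_translation_result first.
suggestions = suggester.get_suggestions_from_translator(1, translator)
suggestions = suggester.get_suggestions_from_translator(1, translator, truecaser)
for suggestion in suggestions:
    print(suggestion)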
machine/translation/truecaser.py

Lines changed: 20 additions & 1 deletion
@@ -1,8 +1,11 @@
 from abc import ABC, abstractmethod
-from typing import Sequence
+from typing import Optional, Sequence

 from ..corpora.text_corpus import TextCorpus
+from ..tokenization.detokenizer import Detokenizer
+from ..tokenization.whitespace_detokenizer import WHITESPACE_DETOKENIZER
 from .trainer import Trainer
+from .translation_result import TranslationResult


 class Truecaser(ABC):
@@ -15,5 +18,21 @@ def train_segment(self, segment: Sequence[str], sentence_start: bool = True) ->
     @abstractmethod
     def truecase(self, segment: Sequence[str]) -> Sequence[str]: ...

+    def truecase_translation_result(
+        self, result: TranslationResult, detokenizer: Optional[Detokenizer] = None
+    ) -> TranslationResult:
+        if detokenizer is None:
+            detokenizer = WHITESPACE_DETOKENIZER
+        target_tokens = self.truecase(result.target_tokens)
+        return TranslationResult(
+            detokenizer.detokenize(target_tokens),
+            result.source_tokens,
+            target_tokens,
+            result.confidences,
+            result.sources,
+            result.alignment,
+            result.phrases,
+        )
+
     @abstractmethod
     def save(self) -> None: ...

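A hedged sketch of the new helper: it truecases the target tokens of a TranslationResult and rebuilds the result, detokenizing with the given detokenizer (whitespace detokenization by default). The engine and the trained truecaser are assumptions here, not part of this commit.

# Sketch only: `engine` is any TranslationEngine and `truecaser` any trained
# Truecaser implementation; both are placeholders for illustration.
result = engine.translate("this is an example sentence")
truecased_result = truecaser.truecase_translation_result(result)

# The rebuilt TranslationResult keeps the original alignment, confidences,
# sources, and phrases, but its translation text and target tokens are truecased.
print(truecased_result.translation)
print(truecased_result.target_tokens)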
samples/corpora.ipynb

Lines changed: 36 additions & 1 deletion
@@ -328,7 +328,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -371,6 +371,41 @@
     "    print(f\"{row.ref}: {row.text}\")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We can extract non-Scripture portions of the project as well, such as introductory material, footnotes, section headers, etc. This feature is enabled by setting the `include_all_text` flag. Machine uses a special Scripture reference for uniquely identifying all text segments in a Scripture book. Each text segment is referenced by its position relative to a verse and its marker."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1JN 1:0/1:ide: UTF-8\n",
+      "1JN 1:0/2:h: 1 John\n",
+      "1JN 1:0/3:toc1: John’s First Letter\n",
+      "1JN 1:0/4:toc2: 1 John\n",
+      "1JN 1:0/5:toc3: 1 John\n",
+      "1JN 1:0/6:mt1: John’s First Letter\n",
+      "1JN 1:1: That which was from the beginning, that which we have heard, that which we have seen with our eyes, that which we saw, and our hands touched, concerning the Word of life\n",
+      "1JN 1:2: (and the life was revealed, and we have seen, and testify, and declare to you the life, the eternal life, which was with the Father, and was revealed to us);\n",
+      "1JN 1:3: that which we have seen and heard we declare to you, that you also may have fellowship with us. Yes, and our fellowship is with the Father and with his Son, Jesus Christ.\n",
+      "1JN 1:3/1:f: 1:3 “Christ” means “Anointed One”.\n"
+     ]
+    }
+   ],
+   "source": [
+    "corpus = ParatextTextCorpus(\"data/WEB-PT\", include_all_text=True)\n",
+    "for row in corpus.take(10):\n",
+    "    print(f\"{row.ref}: {row.text}\")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},

samples/data/smt.cfg

Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
+# Translation model prefix
+-tm tm/src_trg
+
+# Language model
+-lm lm/trg.lm
+
+# W parameter (maximum number of translation options to be considered per each source phrase)
+-W 10
+
+# S parameter (maximum number of hypotheses that can be stored in each stack)
+-S 10
+
+# A parameter (maximum length in words of the source phrases to be translated)
+-A 7
+
+# Degree of non-monotonicity
+-nomon 0
+
+# Heuristic function used
+-h 6
+
+# Best-first search flag
+-be
+
+# Translation model weights
+-tmw 0 0.5 1 1 1 1 0 1
+
+# Set online learning parameters (ol_alg, lr_policy, l_stepsize, em_iters, e_par, r_par)
+-olp 0 0 1 5 1 0

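The new samples/data/smt.cfg is a Thot SMT decoder configuration used by the machine translation tutorial. Below is a hedged sketch of loading it; the constructor form, the word alignment model type, and the source segment are assumptions, and the tm/ and lm/ paths in the file must point at an existing trained translation model and language model.

# Sketch only: assumes a trained Thot translation model and language model exist
# at the tm/ and lm/ paths referenced by the config file.
from machine.translation.thot import ThotSmtModel, ThotWordAlignmentModelType

model = ThotSmtModel(ThotWordAlignmentModelType.HMM, "samples/data/smt.cfg")
result = model.translate("una frase de ejemplo")  # placeholder source segment
print(result.translation)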