
Commit 2fae0c6

ddaspit and johnml1135 authored
Add MT tutorial (#129)
* Add MT tutorial
* Rename to "add_unk_trg_tokens"

---------

Co-authored-by: John Lambert <john_lambert@sil.org>
1 parent cec61de commit 2fae0c6

File tree

12 files changed: +1622, -81 lines changed


.vscode/settings.json

Lines changed: 4 additions & 13 deletions
@@ -1,24 +1,15 @@
 {
     "editor.formatOnSave": true,
     "editor.codeActionsOnSave": {
-        "source.organizeImports": "explicit",
+        "source.organizeImports": "explicit"
     },
     "python.testing.unittestEnabled": false,
     "python.testing.pytestEnabled": true,
-    "python.analysis.extraPaths": [
-        "tests"
-    ],
+    "python.analysis.extraPaths": ["tests"],
     "python.analysis.importFormat": "relative",
     "[python]": {
         "editor.defaultFormatter": "ms-python.black-formatter",
         "editor.formatOnSave": true
     },
-    "black-formatter.path": [
-        "poetry",
-        "run",
-        "black"
-    ],
-    "python.analysis.extraPaths": [
-        "./tests"
-    ]
-}
+    "black-formatter.path": ["poetry", "run", "black"]
+}

README.md

Lines changed: 2 additions & 1 deletion
@@ -17,4 +17,5 @@ If you would like to find out more about how to use Machine, check out the tutor
 - [Tokenization](https://githubtocolab.com/sillsdev/machine.py/blob/main/samples/tokenization.ipynb)
 - [Text Corpora](https://githubtocolab.com/sillsdev/machine.py/blob/main/samples/corpora.ipynb)
 - [Word Alignment](https://githubtocolab.com/sillsdev/machine.py/blob/main/samples/word_alignment.ipynb)
-- [Paratext/USFM Processing](https://githubtocolab.com/sillsdev/machine.py/blob/main/samples/paratext_usfm.ipynb)
+- [Machine Translation](https://githubtocolab.com/sillsdev/machine.py/blob/main/samples/machine_translation.ipynb)
+- [Paratext/USFM Processing](https://githubtocolab.com/sillsdev/machine.py/blob/main/samples/paratext_usfm.ipynb)

machine/jobs/huggingface/hugging_face_nmt_model_factory.py

Lines changed: 13 additions & 1 deletion
@@ -3,6 +3,8 @@
 from pathlib import Path
 from typing import Any, cast

+import datasets.utils.logging as datasets_logging
+import transformers.utils.logging as transformers_logging
 from transformers import AutoConfig, AutoModelForSeq2SeqLM, HfArgumentParser, PreTrainedModel, Seq2SeqTrainingArguments
 from transformers.integrations import ClearMLCallback
 from transformers.tokenization_utils import TruncationStrategy
@@ -39,6 +41,16 @@ def __init__(self, config: Any) -> None:
         ):
             self._training_args.report_to.remove("clearml")

+        # The default of training_args.log_level is passive, so we set log level at info here to have that default.
+        transformers_logging.set_verbosity_info()
+
+        log_level = self._training_args.get_process_log_level()
+        logger.setLevel(log_level)
+        datasets_logging.set_verbosity(log_level)
+        transformers_logging.set_verbosity(log_level)
+        transformers_logging.enable_default_handler()
+        transformers_logging.enable_explicit_format()
+
     @property
     def train_tokenizer(self) -> bool:
         return False
@@ -67,7 +79,7 @@ def create_model_trainer(self, corpus: ParallelTextCorpus) -> Trainer:
             src_lang=self._config.src_lang,
             tgt_lang=self._config.trg_lang,
             add_unk_src_tokens=self._config.huggingface.tokenizer.add_unk_src_tokens,
-            add_unk_trg_tokens=self._config.huggingface.tokenizer.add_unk_trg_tokens,
+            add_unk_tgt_tokens=self._config.huggingface.tokenizer.add_unk_trg_tokens,
         )

     def create_engine(self) -> TranslationEngine:

machine/translation/huggingface/hugging_face_nmt_engine.py

Lines changed: 1 addition & 1 deletion
@@ -52,7 +52,7 @@ def __init__(
         self._tokenizer = AutoTokenizer.from_pretrained(self._model.name_or_path, use_fast=True)
         if isinstance(self._tokenizer, (NllbTokenizer, NllbTokenizerFast)):
             self._mpn = MosesPunctNormalizer()
-            self._mpn.substitutions = [(re.compile(r), sub) for r, sub in self._mpn.substitutions]
+            self._mpn.substitutions = [(re.compile(r), sub) for r, sub in self._mpn.substitutions]  # type: ignore
         else:
             self._mpn = None

machine/translation/huggingface/hugging_face_nmt_model_trainer.py

Lines changed: 17 additions & 30 deletions
@@ -6,9 +6,7 @@
 from pathlib import Path
 from typing import Any, Callable, List, Optional, Union, cast

-import datasets.utils.logging as datasets_logging
 import torch  # pyright: ignore[reportMissingImports]
-import transformers.utils.logging as transformers_logging
 from datasets.arrow_dataset import Dataset
 from sacremoses import MosesPunctNormalizer
 from torch import Tensor  # pyright: ignore[reportMissingImports]
@@ -84,10 +82,10 @@ def __init__(
         corpus: Union[ParallelTextCorpus, Dataset],
         src_lang: Optional[str] = None,
         tgt_lang: Optional[str] = None,
-        max_source_length: Optional[int] = None,
-        max_target_length: Optional[int] = None,
+        max_src_length: Optional[int] = None,
+        max_tgt_length: Optional[int] = None,
         add_unk_src_tokens: bool = False,
-        add_unk_trg_tokens: bool = True,
+        add_unk_tgt_tokens: bool = True,
     ) -> None:
         self._model = model
         self._training_args = training_args
@@ -96,12 +94,12 @@ def __init__(
         self._tgt_lang = tgt_lang
         self._trainer: Optional[Seq2SeqTrainer] = None
         self._metrics = {}
-        self.max_source_length = max_source_length
-        self.max_target_length = max_target_length
+        self.max_src_length = max_src_length
+        self.max_tgt_length = max_tgt_length
         self._add_unk_src_tokens = add_unk_src_tokens
-        self._add_unk_trg_tokens = add_unk_trg_tokens
+        self._add_unk_tgt_tokens = add_unk_tgt_tokens
         self._mpn = MosesPunctNormalizer()
-        self._mpn.substitutions = [(re.compile(r), sub) for r, sub in self._mpn.substitutions]
+        self._mpn.substitutions = [(re.compile(r), sub) for r, sub in self._mpn.substitutions]  # type: ignore
         self._stats = TrainStats()

     @property
@@ -113,17 +111,6 @@ def train(
         progress: Optional[Callable[[ProgressStatus], None]] = None,
         check_canceled: Optional[Callable[[], None]] = None,
     ) -> None:
-        if self._training_args.should_log:
-            # The default of training_args.log_level is passive, so we set log level at info here to have that default.
-            transformers_logging.set_verbosity_info()
-
-        log_level = self._training_args.get_process_log_level()
-        logger.setLevel(log_level)
-        datasets_logging.set_verbosity(log_level)
-        transformers_logging.set_verbosity(log_level)
-        transformers_logging.enable_default_handler()
-        transformers_logging.enable_explicit_format()
-
         last_checkpoint = None
         if os.path.isdir(self._training_args.output_dir) and not self._training_args.overwrite_output_dir:
             last_checkpoint = get_last_checkpoint(self._training_args.output_dir)
@@ -203,7 +190,7 @@ def add_tokens(tokenizer: Any, missing_tokens: List[str]) -> Any:
             logger.info(f"Added {len(missing_tokens)} tokens to the tokenizer: {missing_tokens}")
             return AutoTokenizer.from_pretrained(str(tokenizer_dir), use_fast=True)

-        if self._add_unk_src_tokens or self._add_unk_trg_tokens:
+        if self._add_unk_src_tokens or self._add_unk_tgt_tokens:
             logger.info("Checking for missing tokens")
             if not isinstance(tokenizer, PreTrainedTokenizerFast):
                 logger.warning(
@@ -217,7 +204,7 @@ def add_tokens(tokenizer: Any, missing_tokens: List[str]) -> Any:
                 )
                 # using unofficially supported behavior to set the normalizer
                 tokenizer.backend_tokenizer.normalizer = norm_tok.backend_tokenizer.normalizer  # type: ignore
-            if self._add_unk_src_tokens and self._add_unk_trg_tokens:
+            if self._add_unk_src_tokens and self._add_unk_tgt_tokens:
                 lang_codes = [src_lang, tgt_lang]
             elif self._add_unk_src_tokens:
                 lang_codes = [src_lang]
@@ -293,12 +280,12 @@ def add_lang_code_to_tokenizer(tokenizer: Any, lang_code: str):
         if model.name_or_path.startswith("t5-") or model.name_or_path.startswith("google/mt5-"):
             prefix = f"translate {self._src_lang} to {self._tgt_lang}: "

-        max_source_length = self.max_source_length
-        if max_source_length is None:
-            max_source_length = model.config.max_length
-        max_target_length = self.max_target_length
-        if max_target_length is None:
-            max_target_length = model.config.max_length
+        max_src_length = self.max_src_length
+        if max_src_length is None:
+            max_src_length = model.config.max_length
+        max_tgt_length = self.max_tgt_length
+        if max_tgt_length is None:
+            max_tgt_length = model.config.max_length

         if self._training_args.label_smoothing_factor > 0 and not hasattr(
             model, "prepare_decoder_input_ids_from_labels"
@@ -317,9 +304,9 @@ def preprocess_function(examples):
             inputs = [prefix + ex[src_lang] for ex in examples["translation"]]
             targets = [ex[tgt_lang] for ex in examples["translation"]]

-            model_inputs = tokenizer(inputs, max_length=max_source_length, truncation=True)
+            model_inputs = tokenizer(inputs, max_length=max_src_length, truncation=True)
             # Tokenize targets with the `text_target` keyword argument
-            labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)
+            labels = tokenizer(text_target=targets, max_length=max_tgt_length, truncation=True)

             model_inputs["labels"] = labels["input_ids"]
             return model_inputs

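For orientation, here is a minimal sketch of calling the trainer with the renamed keyword arguments (max_src_length, max_tgt_length, add_unk_tgt_tokens). It assumes the trainer accepts a pretrained model name as its first argument; the checkpoint name, language codes, output directory, and corpus are placeholders for illustration, not values taken from this commit.

# Sketch only: placeholder checkpoint, language codes, corpus, and output directory.
from transformers import Seq2SeqTrainingArguments

from machine.translation.huggingface import HuggingFaceNmtModelTrainer

training_args = Seq2SeqTrainingArguments(output_dir="out/nmt_model", num_train_epochs=1)
corpus = ...  # any ParallelTextCorpus (or a datasets.Dataset) with "translation" rows

trainer = HuggingFaceNmtModelTrainer(
    "facebook/nllb-200-distilled-600M",  # placeholder model name
    training_args,
    corpus,
    src_lang="eng_Latn",
    tgt_lang="fra_Latn",
    max_src_length=200,
    max_tgt_length=200,
    add_unk_src_tokens=False,
    add_unk_tgt_tokens=True,
)
trainer.train()
trainer.save()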
machine/translation/translation_suggester.py

Lines changed: 13 additions & 1 deletion
@@ -1,8 +1,10 @@
 from abc import ABC, abstractmethod
-from typing import Iterable, Sequence
+from typing import Iterable, Optional, Sequence

+from .interactive_translator import InteractiveTranslator
 from .translation_result import TranslationResult
 from .translation_suggestion import TranslationSuggestion
+from .truecaser import Truecaser


 class TranslationSuggester(ABC):
@@ -14,3 +16,13 @@ def __init__(self, confidence_threshold: float = 0, break_on_punctuation: bool =
     def get_suggestions(
         self, n: int, prefix_count: int, is_last_word_complete: bool, results: Iterable[TranslationResult]
     ) -> Sequence[TranslationSuggestion]: ...
+
+    def get_suggestions_from_translator(
+        self, n: int, translator: InteractiveTranslator, truecaser: Optional[Truecaser] = None
+    ) -> Sequence[TranslationSuggestion]:
+        results = translator.get_current_results()
+        if truecaser is not None:
+            results = (
+                truecaser.truecase_translation_result(result, translator.target_detokenizer) for result in results
+            )
+        return self.get_suggestions(n, len(translator.prefix_word_ranges), translator.is_last_word_complete, results)

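A short, hedged sketch of how the new convenience method might be called. Only the get_suggestions_from_translator signature comes from the diff above; the concrete suggester, the interactive translator, and the truecaser are assumptions for illustration.

# Sketch only: assumes an InteractiveTranslator has already been created for a
# source segment (e.g. via an InteractiveTranslatorFactory) and that a concrete
# suggester such as PhraseTranslationSuggester and a trained truecaser exist.
suggester = PhraseTranslationSuggester(confidence_threshold=0.2)

# Without a truecaser, the translator's current results are used as-is; with one,
# each TranslationResult is passed through truecase_translation_result first.
suggestions = suggester.get_suggestions_from_translator(1, translator)
suggestions = suggester.get_suggestions_from_translator(1, translator, truecaser)
for suggestion in suggestions:
    print(suggestion)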
machine/translation/truecaser.py

Lines changed: 20 additions & 1 deletion
@@ -1,8 +1,11 @@
 from abc import ABC, abstractmethod
-from typing import Sequence
+from typing import Optional, Sequence

 from ..corpora.text_corpus import TextCorpus
+from ..tokenization.detokenizer import Detokenizer
+from ..tokenization.whitespace_detokenizer import WHITESPACE_DETOKENIZER
 from .trainer import Trainer
+from .translation_result import TranslationResult


 class Truecaser(ABC):
@@ -15,5 +18,21 @@ def train_segment(self, segment: Sequence[str], sentence_start: bool = True) ->
     @abstractmethod
     def truecase(self, segment: Sequence[str]) -> Sequence[str]: ...

+    def truecase_translation_result(
+        self, result: TranslationResult, detokenizer: Optional[Detokenizer] = None
+    ) -> TranslationResult:
+        if detokenizer is None:
+            detokenizer = WHITESPACE_DETOKENIZER
+        target_tokens = self.truecase(result.target_tokens)
+        return TranslationResult(
+            detokenizer.detokenize(target_tokens),
+            result.source_tokens,
+            target_tokens,
+            result.confidences,
+            result.sources,
+            result.alignment,
+            result.phrases,
+        )
+
     @abstractmethod
     def save(self) -> None: ...

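A hedged sketch of the new helper: it truecases the target tokens of a TranslationResult and rebuilds the result, detokenizing with the given detokenizer (whitespace detokenization by default). The engine and the trained truecaser are assumptions here, not part of this commit.

# Sketch only: `engine` is any TranslationEngine and `truecaser` any trained
# Truecaser implementation; both are placeholders for illustration.
result = engine.translate("this is an example sentence")
truecased_result = truecaser.truecase_translation_result(result)

# The rebuilt TranslationResult keeps the original alignment, confidences,
# sources, and phrases, but its translation text and target tokens are truecased.
print(truecased_result.translation)
print(truecased_result.target_tokens)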
samples/corpora.ipynb

Lines changed: 36 additions & 1 deletion
@@ -328,7 +328,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -371,6 +371,41 @@
     "    print(f\"{row.ref}: {row.text}\")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We can extract non-Scripture portions of the project as well, such as introductory material, footnotes, section headers, etc. This feature is enabled by setting the `include_all_text` flag. Machine uses a special Scripture reference for uniquely identifying all text segments in a Scripture book. Each text segment is referenced by its position relative to a verse and its marker."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1JN 1:0/1:ide: UTF-8\n",
+      "1JN 1:0/2:h: 1 John\n",
+      "1JN 1:0/3:toc1: John’s First Letter\n",
+      "1JN 1:0/4:toc2: 1 John\n",
+      "1JN 1:0/5:toc3: 1 John\n",
+      "1JN 1:0/6:mt1: John’s First Letter\n",
+      "1JN 1:1: That which was from the beginning, that which we have heard, that which we have seen with our eyes, that which we saw, and our hands touched, concerning the Word of life\n",
+      "1JN 1:2: (and the life was revealed, and we have seen, and testify, and declare to you the life, the eternal life, which was with the Father, and was revealed to us);\n",
+      "1JN 1:3: that which we have seen and heard we declare to you, that you also may have fellowship with us. Yes, and our fellowship is with the Father and with his Son, Jesus Christ.\n",
+      "1JN 1:3/1:f: 1:3 “Christ” means “Anointed One”.\n"
+     ]
+    }
+   ],
+   "source": [
+    "corpus = ParatextTextCorpus(\"data/WEB-PT\", include_all_text=True)\n",
+    "for row in corpus.take(10):\n",
+    "    print(f\"{row.ref}: {row.text}\")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},

samples/data/smt.cfg

Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
+# Translation model prefix
+-tm tm/src_trg
+
+# Language model
+-lm lm/trg.lm
+
+# W parameter (maximum number of translation options to be considered per each source phrase)
+-W 10
+
+# S parameter (maximum number of hypotheses that can be stored in each stack)
+-S 10
+
+# A parameter (maximum length in words of the source phrases to be translated)
+-A 7
+
+# Degree of non-monotonicity
+-nomon 0
+
+# Heuristic function used
+-h 6
+
+# Best-first search flag
+-be
+
+# Translation model weights
+-tmw 0 0.5 1 1 1 1 0 1
+
+# Set online learning parameters (ol_alg, lr_policy, l_stepsize, em_iters, e_par, r_par)
+-olp 0 0 1 5 1 0

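The new samples/data/smt.cfg is a Thot SMT decoder configuration used by the machine translation tutorial. Below is a hedged sketch of loading it; the constructor form, the word alignment model type, and the source segment are assumptions, and the tm/ and lm/ paths in the file must point at an existing trained translation model and language model.

# Sketch only: assumes a trained Thot translation model and language model exist
# at the tm/ and lm/ paths referenced by the config file.
from machine.translation.thot import ThotSmtModel, ThotWordAlignmentModelType

model = ThotSmtModel(ThotWordAlignmentModelType.HMM, "samples/data/smt.cfg")
result = model.translate("una frase de ejemplo")  # placeholder source segment
print(result.translation)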