66 skip ("skipping Hugging Face tests on MacOS" , allow_module_level = True )
77
88from tempfile import TemporaryDirectory
9-
10- from transformers import PreTrainedTokenizerFast , Seq2SeqTrainingArguments
9+ from typing import cast
10+
11+ from transformers import (
12+ M2M100Tokenizer ,
13+ MBart50Tokenizer ,
14+ MBart50TokenizerFast ,
15+ MBartTokenizer ,
16+ MBartTokenizerFast ,
17+ NllbTokenizer ,
18+ NllbTokenizerFast ,
19+ PreTrainedTokenizerFast ,
20+ Seq2SeqTrainingArguments ,
21+ )
1122
1223from machine .corpora import DictionaryTextCorpus , MemoryText , TextRow
13- from machine .translation .huggingface import HuggingFaceNmtEngine , HuggingFaceNmtModelTrainer
24+ from machine .translation .huggingface import HuggingFaceNmtEngine , HuggingFaceNmtModelTrainer , add_lang_code_to_tokenizer
1425
1526
1627def test_train_non_empty_corpus () -> None :
@@ -142,10 +153,8 @@ def test_update_tokenizer_missing_char() -> None:
142153 "Ḻ, ḻ, Ṉ, ॽ, " + " and " + "" + " are new characters"
143154 )
144155 finetuned_result_nochar_composite = finetuned_engine_nochar .tokenizer .encode ("Ḏ is a composite character" )
145- normalized_result_nochar1 = finetuned_engine_nochar .tokenizer .backend_tokenizer .normalizer .normalize_str (
146- " "
147- )
148- normalized_result_nochar2 = finetuned_engine_nochar .tokenizer .backend_tokenizer .normalizer .normalize_str ("" )
156+ norm_result_nochar1 = finetuned_engine_nochar .tokenizer .backend_tokenizer .normalizer .normalize_str (" " )
157+ norm_result_nochar2 = finetuned_engine_nochar .tokenizer .backend_tokenizer .normalizer .normalize_str ("" )
149158
150159 with HuggingFaceNmtModelTrainer (
151160 "hf-internal-testing/tiny-random-nllb" ,
@@ -167,11 +176,11 @@ def test_update_tokenizer_missing_char() -> None:
167176 "Ḻ, ḻ, Ṉ, ॽ, " + " and " + "" + " are new characters"
168177 )
169178 finetuned_result_char_composite = finetuned_engine_char .tokenizer .encode ("Ḏ is a composite character" )
170- normalized_result_char1 = finetuned_engine_char .tokenizer .backend_tokenizer .normalizer .normalize_str (" " )
171- normalized_result_char2 = finetuned_engine_char .tokenizer .backend_tokenizer .normalizer .normalize_str ("" )
179+ norm_result_char1 = finetuned_engine_char .tokenizer .backend_tokenizer .normalizer .normalize_str (" " )
180+ norm_result_char2 = finetuned_engine_char .tokenizer .backend_tokenizer .normalizer .normalize_str ("" )
172181
173- assert normalized_result_nochar1 != normalized_result_char1
174- assert normalized_result_nochar2 != normalized_result_char2
182+ assert norm_result_nochar1 != norm_result_char1
183+ assert norm_result_nochar2 != norm_result_char2
175184
176185 assert finetuned_result_nochar != finetuned_result_char
177186 assert finetuned_result_nochar_composite != finetuned_result_char_composite
@@ -467,5 +476,94 @@ def test_update_tokenizer_no_missing_char() -> None:
     assert finetuned_result_nochar == finetuned_result_char
 
 
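+# The tests below cover add_lang_code_to_tokenizer for each supported tokenizer
+# class with the same round trip: the new language code is absent at load,
+# registered after the call, and still present after save_pretrained /
+# from_pretrained.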
+def test_nllb_tokenizer_add_lang_code() -> None:
+    with TemporaryDirectory() as temp_dir:
+        tokenizer = cast(NllbTokenizer, NllbTokenizer.from_pretrained("facebook/nllb-200-distilled-600M"))
+        assert "new_lang" not in tokenizer.added_tokens_encoder
+        add_lang_code_to_tokenizer(tokenizer, "new_lang")
+        assert "new_lang" in tokenizer.added_tokens_encoder
+        tokenizer.save_pretrained(temp_dir)
+        new_tokenizer = cast(NllbTokenizer, NllbTokenizer.from_pretrained(temp_dir))
+        assert "new_lang" in new_tokenizer.added_tokens_encoder
+        return
+
+
+def test_nllb_tokenizer_fast_add_lang_code() -> None:
+    with TemporaryDirectory() as temp_dir:
+        tokenizer = cast(NllbTokenizerFast, NllbTokenizerFast.from_pretrained("facebook/nllb-200-distilled-600M"))
+        assert "new_lang" not in tokenizer.added_tokens_encoder
+        add_lang_code_to_tokenizer(tokenizer, "new_lang")
+        assert "new_lang" in tokenizer.added_tokens_encoder
+        tokenizer.save_pretrained(temp_dir)
+        new_tokenizer = cast(NllbTokenizerFast, NllbTokenizerFast.from_pretrained(temp_dir))
+        assert "new_lang" in new_tokenizer.added_tokens_encoder
+        return
+
+
+def test_mbart_tokenizer_add_lang_code() -> None:
+    with TemporaryDirectory() as temp_dir:
+        tokenizer = cast(MBartTokenizer, MBartTokenizer.from_pretrained("hf-internal-testing/tiny-random-nllb"))
+        assert "nl_NS" not in tokenizer.added_tokens_encoder
+        add_lang_code_to_tokenizer(tokenizer, "nl_NS")
+        assert "nl_NS" in tokenizer.added_tokens_encoder
+        tokenizer.save_pretrained(temp_dir)
+        new_tokenizer = cast(MBartTokenizer, MBartTokenizer.from_pretrained(temp_dir))
+        assert "nl_NS" in new_tokenizer.added_tokens_encoder
+        return
+
+
+def test_mbart_tokenizer_fast_add_lang_code() -> None:
+    with TemporaryDirectory() as temp_dir:
+        tokenizer = cast(MBartTokenizerFast, MBartTokenizerFast.from_pretrained("hf-internal-testing/tiny-random-nllb"))
+        assert "nl_NS" not in tokenizer.added_tokens_encoder
+        add_lang_code_to_tokenizer(tokenizer, "nl_NS")
+        assert "nl_NS" in tokenizer.added_tokens_encoder
+        tokenizer.save_pretrained(temp_dir)
+        new_tokenizer = cast(MBartTokenizerFast, MBartTokenizerFast.from_pretrained(temp_dir))
+        assert "nl_NS" in new_tokenizer.added_tokens_encoder
+        return
+
+
+def test_mbart_50_tokenizer_add_lang_code() -> None:
+    with TemporaryDirectory() as temp_dir:
+        tokenizer = cast(MBart50Tokenizer, MBart50Tokenizer.from_pretrained("hf-internal-testing/tiny-random-mbart50"))
+        assert "nl_NS" not in tokenizer.added_tokens_encoder
+        add_lang_code_to_tokenizer(tokenizer, "nl_NS")
+        assert "nl_NS" in tokenizer.added_tokens_encoder
+        tokenizer.save_pretrained(temp_dir)
+        new_tokenizer = cast(MBart50Tokenizer, MBart50Tokenizer.from_pretrained(temp_dir))
+        assert "nl_NS" in new_tokenizer.added_tokens_encoder
+        return
+
+
+def test_mbart_50_tokenizer_fast_add_lang_code() -> None:
+    with TemporaryDirectory() as temp_dir:
+        tokenizer = cast(
+            MBart50TokenizerFast, MBart50TokenizerFast.from_pretrained("hf-internal-testing/tiny-random-mbart50")
+        )
+        assert "nl_NS" not in tokenizer.added_tokens_encoder
+        add_lang_code_to_tokenizer(tokenizer, "nl_NS")
+        assert "nl_NS" in tokenizer.added_tokens_encoder
+        tokenizer.save_pretrained(temp_dir)
+        new_tokenizer = cast(MBart50TokenizerFast, MBart50TokenizerFast.from_pretrained(temp_dir))
+        assert "nl_NS" in new_tokenizer.added_tokens_encoder
+        return
+
+
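+# M2M100 keeps bare codes in lang_code_to_id but registers the wrapped
+# "__code__" form as the added token, so the test checks both mappings.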
+def test_m2m_100_tokenizer_add_lang_code() -> None:
+    with TemporaryDirectory() as temp_dir:
+        tokenizer = cast(M2M100Tokenizer, M2M100Tokenizer.from_pretrained("stas/tiny-m2m_100"))
+        assert "nc" not in tokenizer.lang_code_to_id
+        assert "__nc__" not in tokenizer.added_tokens_encoder
+        add_lang_code_to_tokenizer(tokenizer, "nc")
+        assert "nc" in tokenizer.lang_code_to_id
+        assert "__nc__" in tokenizer.added_tokens_encoder
+        tokenizer.save_pretrained(temp_dir)
+        new_tokenizer = cast(M2M100Tokenizer, M2M100Tokenizer.from_pretrained(temp_dir))
+        assert "nc" in new_tokenizer.lang_code_to_id
+        assert "__nc__" in new_tokenizer.added_tokens_encoder
+        return
+
+
 def _row(row_ref: int, text: str) -> TextRow:
     return TextRow("text1", row_ref, segment=[text])