From f54077c0d6e8d36c4e6562eb1fc3e9e8a7a6e943 Mon Sep 17 00:00:00 2001 From: Carbon225 Date: Sat, 30 Nov 2024 23:25:37 +0100 Subject: [PATCH 1/6] offset the disallowed sequence index by ignored characters --- ens_normalize/normalization.py | 19 +++++++++++++------ tests/test_normalization.py | 17 +++++++++++++++++ 2 files changed, 30 insertions(+), 6 deletions(-) diff --git a/ens_normalize/normalization.py b/ens_normalize/normalization.py index 6f95efb..fed1182 100644 --- a/ens_normalize/normalization.py +++ b/ens_normalize/normalization.py @@ -558,7 +558,10 @@ def post_check_empty(name: str, input: str) -> Optional[CurableSequence]: # fully ignorable name return CurableSequence( CurableSequenceType.EMPTY_LABEL, - index=0, + # We set the index to -1 to let offset_err_start() + # know that this is the special empty name case. + # Otherwise, it would offset the index past the ignored characters. + index=-1, sequence=input, suggested='', ) @@ -581,7 +584,7 @@ def post_check_empty(name: str, input: str) -> Optional[CurableSequence]: return CurableSequence( CurableSequenceType.EMPTY_LABEL, index=i, - sequence='..', + sequence='..', # !! suggested='.', ) @@ -598,7 +601,7 @@ def post_check_underscore(label: str) -> Optional[CurableSequence]: return CurableSequence( CurableSequenceType.UNDERSCORE, index=i, - sequence='_' * cnt, + sequence='_' * cnt, # !! suggested='', ) @@ -608,7 +611,7 @@ def post_check_hyphen(label: str) -> Optional[CurableSequence]: return CurableSequence( CurableSequenceType.HYPHEN, index=2, - sequence='--', + sequence='--', # !! suggested='', ) @@ -648,7 +651,7 @@ def make_fenced_error(cps: List[int], start: int, end: int) -> CurableSequence: return CurableSequence( type_, index=start, - sequence=''.join(map(chr, cps[start:end])), + sequence=''.join(map(chr, cps[start:end])), # !! suggested=suggested, ) @@ -1097,12 +1100,16 @@ def offset_err_start(err: Optional[CurableSequence], tokens: List[Token]): Output of post_check() is not input aligned. This function offsets the error index (in-place) to match the input characters. """ + if err.index < 0: + # empty name case + err.index = 0 + return # index in string that was scanned i = 0 # offset between input and scanned offset = 0 for tok in tokens: - if i >= err.index: + if i > err.index: # everything before the error is aligned break if tok.type in (TY_IGNORED, TY_DISALLOWED): diff --git a/tests/test_normalization.py b/tests/test_normalization.py index 93950a2..37357c0 100644 --- a/tests/test_normalization.py +++ b/tests/test_normalization.py @@ -532,3 +532,20 @@ def test_simple_name_optimization(): assert len(r.cures) == 0 assert r.error is None assert r.normalizations is None + + +@pytest.mark.parametrize( + 'input_str, expected_code, expected_index, expected_sequence, expected_suggested', + [ + ('nick.\ufe0f\ufe0f.eth', 'EMPTY_LABEL', 4, '.\ufe0f\ufe0f.', '.'), + ('01\ufe0f--345', 'HYPHEN', 3, '--', ''), + ('01-\ufe0f-345', 'HYPHEN', 2, '-\ufe0f-', ''), + ("\ufe0f'b", 'FENCED_LEADING', 1, '’', ''), + ], +) +def test_suggestions_with_ignored(input_str, expected_code, expected_index, expected_sequence, expected_suggested): + e = ens_process(input_str).error + assert e.code == expected_code + assert e.index == expected_index + assert e.sequence == expected_sequence + assert e.suggested == expected_suggested From bd9d4d283c52a99f0ea69ea6d20fefc099249f1f Mon Sep 17 00:00:00 2001 From: Carbon225 Date: Sat, 30 Nov 2024 23:57:32 +0100 Subject: [PATCH 2/6] insert ignored chars into disallowed sequence --- ens_normalize/normalization.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/ens_normalize/normalization.py b/ens_normalize/normalization.py index fed1182..ac28475 100644 --- a/ens_normalize/normalization.py +++ b/ens_normalize/normalization.py @@ -1060,7 +1060,7 @@ def ens_process( label_is_greek = [] error = post_check(emojis_as_fe0f, label_is_greek, input) if isinstance(error, CurableSequence): # or NormalizableSequence because of inheritance - offset_err_start(error, tokens) + offset_err_start(error, tokens, input) # else: # only the result of post_check() is not input aligned @@ -1095,7 +1095,20 @@ def ens_process( ) -def offset_err_start(err: Optional[CurableSequence], tokens: List[Token]): +def restore_ignored_in_sequence(seq: str, input: str) -> str: + seq_out = [] + input_i = 0 + for c in seq: + # TODO: needs to handle mapped characters + while input[input_i] != c: + seq_out.append(input[input_i]) + input_i += 1 + seq_out.append(c) + input_i += 1 + return ''.join(seq_out) + + +def offset_err_start(err: Optional[CurableSequence], tokens: List[Token], input: str): """ Output of post_check() is not input aligned. This function offsets the error index (in-place) to match the input characters. @@ -1134,6 +1147,7 @@ def offset_err_start(err: Optional[CurableSequence], tokens: List[Token]): # input: cps, scanned: cps i += len(tok.cps) err.index += offset + err.sequence = restore_ignored_in_sequence(err.sequence, input[err.index :]) def ens_normalize(text: str) -> str: From b8437739437bf06a86ff8b3a4dbd4c229e1493d1 Mon Sep 17 00:00:00 2001 From: Carbon225 Date: Tue, 31 Dec 2024 22:21:21 +0100 Subject: [PATCH 3/6] Handle mapped chars in ignored suggestions --- ens_normalize/normalization.py | 40 +++++++++++++++++++++++++++++----- tests/test_normalization.py | 26 +++++++++++++++++++++- 2 files changed, 60 insertions(+), 6 deletions(-) diff --git a/ens_normalize/normalization.py b/ens_normalize/normalization.py index ac28475..65d9435 100644 --- a/ens_normalize/normalization.py +++ b/ens_normalize/normalization.py @@ -1096,15 +1096,45 @@ def ens_process( def restore_ignored_in_sequence(seq: str, input: str) -> str: + """ + Restore any ignored characters from the input string into the sequence. + + Args: + seq: The sequence to restore ignored characters into + input: The input string that may contain ignored characters + + Returns: + The sequence with ignored characters restored + """ + if not seq: + return seq + seq_out = [] input_i = 0 - for c in seq: - # TODO: needs to handle mapped characters - while input[input_i] != c: + seq_len = len(seq) + matched = 0 + + # Keep going until we've matched all characters in seq + while matched < seq_len and input_i < len(input): + # For mapped characters, we need to check if the current input char + # maps to our target sequence char + input_cp = ord(input[input_i]) + mapped_cps = NORMALIZATION.mapped.get(input_cp, [input_cp]) + target_cp = ord(seq[matched]) + + if input_cp == target_cp or target_cp in mapped_cps: + seq_out.append(input[input_i]) + matched += 1 + elif matched > 0: + # If we've started matching but hit a non-match, + # include ignored characters between matches seq_out.append(input[input_i]) - input_i += 1 - seq_out.append(c) input_i += 1 + + # If we didn't match everything, use the original sequence + if matched < seq_len: + return seq + return ''.join(seq_out) diff --git a/tests/test_normalization.py b/tests/test_normalization.py index 37357c0..19b0c09 100644 --- a/tests/test_normalization.py +++ b/tests/test_normalization.py @@ -540,7 +540,7 @@ def test_simple_name_optimization(): ('nick.\ufe0f\ufe0f.eth', 'EMPTY_LABEL', 4, '.\ufe0f\ufe0f.', '.'), ('01\ufe0f--345', 'HYPHEN', 3, '--', ''), ('01-\ufe0f-345', 'HYPHEN', 2, '-\ufe0f-', ''), - ("\ufe0f'b", 'FENCED_LEADING', 1, '’', ''), + ("\ufe0f'b", 'FENCED_LEADING', 1, "'", ''), ], ) def test_suggestions_with_ignored(input_str, expected_code, expected_index, expected_sequence, expected_suggested): @@ -549,3 +549,27 @@ def test_suggestions_with_ignored(input_str, expected_code, expected_index, expe assert e.index == expected_index assert e.sequence == expected_sequence assert e.suggested == expected_suggested + + +@pytest.mark.parametrize( + 'input_str, expected_type, expected_index, expected_sequence, expected_suggested', + [ + # Test mapped characters with ignored characters + ('aA\ufe0fA', NormalizableSequenceType.MAPPED, 1, 'A', 'a'), # Single capital A gets mapped + ('aAB', NormalizableSequenceType.MAPPED, 1, 'A', 'a'), # First capital gets mapped + # Test FE0F normalization + ('a🚴‍♂️', NormalizableSequenceType.FE0F, 1, '🚴‍♂️', '🚴‍♂'), # FE0F in emoji + # Test ignored characters + ('a\u00ad', NormalizableSequenceType.IGNORED, 1, '\u00ad', ''), # Soft hyphen is ignored + # Test FE0F as ignored + ('a\ufe0f', NormalizableSequenceType.IGNORED, 1, '\ufe0f', ''), # FE0F by itself is ignored + ], +) +def test_normalizations_with_ignored(input_str, expected_type, expected_index, expected_sequence, expected_suggested): + normalizations = ens_normalizations(input_str) + assert len(normalizations) > 0 + e = normalizations[0] # Get first normalization + assert e.type == expected_type + assert e.index == expected_index + assert e.sequence == expected_sequence + assert e.suggested == expected_suggested From e11a77347f815802b0da16c4d7de5c93b35107b2 Mon Sep 17 00:00:00 2001 From: "kwrobel.eth" Date: Thu, 23 Jan 2025 11:27:35 +0100 Subject: [PATCH 4/6] fix-worflow Signed-off-by: kwrobel.eth --- .github/workflows/test.yml | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index e7a07be..30deed2 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -18,10 +18,16 @@ jobs: matrix: os: [ubuntu-latest, windows-latest, macOS-latest] python-version: [3.8, 3.11] - + steps: - - uses: actions/checkout@v3 - + - if: ${{ github.event_name == 'pull_request' }} + uses: actions/checkout@v4 + with: + ref: ${{ github.event.pull_request.head.ref }} + + - if: ${{ github.event_name != 'pull_request' }} + uses: actions/checkout@v4 + - name: Install poetry run: pipx install poetry From 04d7286ef45ed57b90275fa4b2a3aa78104f5fad Mon Sep 17 00:00:00 2001 From: github-actions Date: Thu, 23 Jan 2025 10:32:56 +0000 Subject: [PATCH 5/6] Update coverage badge --- coverage_badge.svg | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/coverage_badge.svg b/coverage_badge.svg index e5db27c..6bfc8fa 100644 --- a/coverage_badge.svg +++ b/coverage_badge.svg @@ -15,7 +15,7 @@ coverage coverage - 100% - 100% + 99% + 99% From 902937c538de90ed983c07b3ae89e214de62e293 Mon Sep 17 00:00:00 2001 From: "kwrobel.eth" Date: Thu, 23 Jan 2025 11:37:26 +0100 Subject: [PATCH 6/6] Update test.yml Signed-off-by: kwrobel.eth --- .github/workflows/test.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 30deed2..2fe66d0 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -18,7 +18,10 @@ jobs: matrix: os: [ubuntu-latest, windows-latest, macOS-latest] python-version: [3.8, 3.11] - + + permissions: + contents: write + steps: - if: ${{ github.event_name == 'pull_request' }} uses: actions/checkout@v4