diff --git a/tools/migrations/26-02-12--normalize_simplified_article_unicode.py b/tools/migrations/26-02-12--normalize_simplified_article_unicode.py
new file mode 100644
index 00000000..df402908
--- /dev/null
+++ b/tools/migrations/26-02-12--normalize_simplified_article_unicode.py
@@ -0,0 +1,88 @@
+"""
+Migration: normalize Unicode in simplified articles to NFC.
+
+Rewrites title, summary, htmlContent and the linked Source.content of every
+article with parent_article_id set when the stored text is not already NFC
+(e.g. LLM-generated NFD text), to fix visual rendering issues with diacritics.
+
+Run with: source ~/.venvs/z_env/bin/activate && python -m tools.migrations.26-02-12--normalize_simplified_article_unicode
+"""
+
+import unicodedata
+from zeeguu.core.model import db
+from zeeguu.core.model.article import Article
+from zeeguu.core.model.source import Source
+
+
+def normalize_nfc(text):
+    """Return text normalized to NFC (composed form); None is returned unchanged."""
+    if text is None:
+        return None
+    return unicodedata.normalize("NFC", text)
+
+
+def has_decomposed_chars(text):
+    """Return True if text differs from its NFC form (i.e. is not NFC); False for None."""
+    if text is None:
+        return False
+    return text != unicodedata.normalize("NFC", text)
+
+
+def migrate():
+    """NFC-normalize text fields of every simplified article; commit once if any changed."""
+    session = db.session
+    # Simplified articles are the rows with parent_article_id set; all are loaded at once
+    simplified_articles = (
+        session.query(Article).filter(Article.parent_article_id.isnot(None)).all()
+    )
+
+    print(f"Found {len(simplified_articles)} simplified articles to check")
+
+    fixed_count = 0
+    for article in simplified_articles:
+        needs_fix = False  # set when any field of this article (or its source) is rewritten
+
+        # Title: rewritten only when the stored value is not already NFC
+        if has_decomposed_chars(article.title):
+            print(f" Article {article.id}: Fixing title")
+            article.title = normalize_nfc(article.title)
+            needs_fix = True
+
+        # Summary: same check; a None summary is skipped because the check returns False
+        if has_decomposed_chars(article.summary):
+            print(f" Article {article.id}: Fixing summary")
+            article.summary = normalize_nfc(article.summary)
+            needs_fix = True
+
+        # Source content: stored on the Source row referenced by article.source_id
+        if article.source_id:
+            source = session.query(Source).get(article.source_id)
+            if source and has_decomposed_chars(source.content):
+                print(f" Article {article.id}: Fixing source content")
+                source.content = normalize_nfc(source.content)
+                needs_fix = True
+
+        # HTML content: stored on the article row itself
+        if has_decomposed_chars(article.htmlContent):
+            print(f" Article {article.id}: Fixing HTML content")
+            article.htmlContent = normalize_nfc(article.htmlContent)
+            needs_fix = True
+
+        if needs_fix:
+            fixed_count += 1  # counted once per article, however many fields changed
+            session.add(article)
+
+    if fixed_count > 0:
+        print(f"\nFixing {fixed_count} articles...")
+        session.commit()  # single commit covering every modified row
+        print("Done!")
+    else:
+        print("\nNo articles needed fixing.")
+
+
+if __name__ == "__main__":
+    from zeeguu.api.app import create_app  # only imported when run as a script
+
+    app = create_app()
+    with app.app_context():  # migrate() uses db.session inside the app context
+        migrate()
diff --git a/zeeguu/core/model/article.py b/zeeguu/core/model/article.py
index 599c5aa6..ac3aba6c 100644
--- a/zeeguu/core/model/article.py
+++ b/zeeguu/core/model/article.py
@@ -17,6 +17,7 @@
 from sqlalchemy.orm.exc import NoResultFound
 from sqlalchemy.types import TypeDecorator
 
+from zeeguu.core.content_cleaning import flatten_composed_unicode_characters
 from zeeguu.core.language.ml_cefr_classifier import predict_cefr_level
 from zeeguu.core.model.ai_generator import AIGenerator
 from zeeguu.core.model.article_topic_map import ArticleTopicMap
@@ -819,6 +820,12 @@ def create_simplified_version(
         from zeeguu.core.model.source import Source
         from zeeguu.core.model.source_type import SourceType
 
+        # Normalize LLM output before storing it: decomposed (NFD) text causes visual rendering
+        # issues with diacritics (e.g., Romanian ă, â). NOTE(review): confirm the helper applies NFC and accepts None.
+        simplified_title = flatten_composed_unicode_characters(simplified_title)
+        simplified_content = flatten_composed_unicode_characters(simplified_content)
+        simplified_summary = flatten_composed_unicode_characters(simplified_summary)
+
         # Create a Source object for the simplified content
         source_type = SourceType.find_by_type(SourceType.ARTICLE)
         simplified_source = Source.find_or_create(