From 86b94517f1df7d705564d5a93e0d0c431ccd9120 Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Tue, 13 Apr 2021 18:52:28 +0200 Subject: [PATCH 01/40] Enable packratting for pyparser Delivers significant performance improvements by caching previously computed results. --- edtf/parser/grammar.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/edtf/parser/grammar.py b/edtf/parser/grammar.py index c028c6e..d612c5f 100644 --- a/edtf/parser/grammar.py +++ b/edtf/parser/grammar.py @@ -1,5 +1,9 @@ from pyparsing import Literal as L, ParseException, Optional, OneOrMore, \ - ZeroOrMore, oneOf, Regex, Combine, Word, NotAny, nums + ZeroOrMore, oneOf, Regex, Combine, Word, NotAny, nums, ParserElement + +# From the pyparsing performance improvement tips: +# https://github.com/pyparsing/pyparsing/wiki/Performance-Tips +ParserElement.enablePackrat() # (* ************************** Level 0 *************************** *) from edtf.parser.parser_classes import Date, DateAndTime, Interval, Unspecified, \ From 7fdf8dd8b649a5085d8f2aed3b66a8734f2ce915 Mon Sep 17 00:00:00 2001 From: jacobcolyvan Date: Mon, 26 Jul 2021 12:29:25 +1000 Subject: [PATCH 02/40] #37 update for Django 3.x compat --- edtf/fields.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/edtf/fields.py b/edtf/fields.py index 83d10a7..52b9171 100644 --- a/edtf/fields.py +++ b/edtf/fields.py @@ -53,7 +53,7 @@ def deconstruct(self): del kwargs["max_length"] return name, path, args, kwargs - def from_db_value(self, value, expression, connection, context): + def from_db_value(self, value, expression, connection, context=None): # Converting values to Python objects if not value: return None From 6e4a627df5447b76db492b1603f95bbd55524346 Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Fri, 26 Apr 2024 15:43:38 +0200 Subject: [PATCH 03/40] Minor updates --- edtf/natlang/en.py | 3 ++- poetry.lock | 45 +++++++++++++++++++++++++++++++++++++++++++++ pyproject.toml | 18 ++++++++++++++++++ 3 files changed, 65 insertions(+), 1 deletion(-) create mode 100644 poetry.lock create mode 100644 pyproject.toml diff --git a/edtf/natlang/en.py b/edtf/natlang/en.py index ec7842b..5263e07 100644 --- a/edtf/natlang/en.py +++ b/edtf/natlang/en.py @@ -89,6 +89,7 @@ def text_to_edtf(text): is_before = re.findall(r'\bbefore\b', t) is_before = is_before or re.findall(r'\bearlier\b', t) + is_before = is_before or re.findall(r'\baprés\b', t) is_after = re.findall(r'\bafter\b', t) is_after = is_after or re.findall(r'\bsince\b', t) @@ -133,7 +134,7 @@ def text_to_edtf_date(text): is_approximate = is_approximate or re.findall(r'\bcirca\b', t) # the word 'approx'/'around'/'about' anywhere is_approximate = is_approximate or \ - re.findall(r'\b(approx|around|about)', t) + re.findall(r'\b(approx|approximately|around|about)', t) # a ~ before a year-ish number is_approximate = is_approximate or re.findall(r'\b~\d{4}', t) # a ~ at the beginning diff --git a/poetry.lock b/poetry.lock new file mode 100644 index 0000000..745843e --- /dev/null +++ b/poetry.lock @@ -0,0 +1,45 @@ +# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. + +[[package]] +name = "pyparsing" +version = "3.1.2" +description = "pyparsing module - Classes and methods to define and execute parsing grammars" +optional = false +python-versions = ">=3.6.8" +files = [ + {file = "pyparsing-3.1.2-py3-none-any.whl", hash = "sha256:f9db75911801ed778fe61bb643079ff86601aca99fcae6345aa67292038fb742"}, + {file = "pyparsing-3.1.2.tar.gz", hash = "sha256:a1bac0ce561155ecc3ed78ca94d3c9378656ad4c94c1270de543f621420f94ad"}, +] + +[package.extras] +diagrams = ["jinja2", "railroad-diagrams"] + +[[package]] +name = "python-dateutil" +version = "2.9.0.post0" +description = "Extensions to the standard Python datetime module" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" +files = [ + {file = "python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3"}, + {file = "python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427"}, +] + +[package.dependencies] +six = ">=1.5" + +[[package]] +name = "six" +version = "1.16.0" +description = "Python 2 and 3 compatibility utilities" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" +files = [ + {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, + {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, +] + +[metadata] +lock-version = "2.0" +python-versions = "^3.11" +content-hash = "822c6f7ddf2552d097c1bfc8399a2492c845c74cb4576a423adf3ad62850ffc3" diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..f203360 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,18 @@ +[tool.poetry] +name = "python-edtf" +version = "0.1.0" +description = "" +authors = ["Andrew Hankinson "] +readme = "README.md" +packages = [{include = "python_edtf"}] + +[tool.poetry.dependencies] +python = "^3.11" +python-dateutil = "^2.9.0.post0" +pyparsing = "^3.1.2" +six = "^1.16.0" + + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" From 80fdd60cbb590d7139341293185628d6aa8cac5b Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Fri, 26 Apr 2024 15:49:58 +0200 Subject: [PATCH 04/40] Update dependency management --- pyproject.toml | 2 +- setup.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index f203360..f1d7c5f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [tool.poetry] -name = "python-edtf" +name = "edtf" version = "0.1.0" description = "" authors = ["Andrew Hankinson "] diff --git a/setup.py b/setup.py index f0f1849..f2cc7d5 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,6 @@ from __future__ import print_function import setuptools -import sys def readme(): with open('README.md') as f: From c12d759732d393ac66faa462b8d61b057c675d17 Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Fri, 26 Apr 2024 15:55:52 +0200 Subject: [PATCH 05/40] Deps --- poetry.lock | 4 ++-- pyproject.toml | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index 745843e..c4b40b6 100644 --- a/poetry.lock +++ b/poetry.lock @@ -41,5 +41,5 @@ files = [ [metadata] lock-version = "2.0" -python-versions = "^3.11" -content-hash = "822c6f7ddf2552d097c1bfc8399a2492c845c74cb4576a423adf3ad62850ffc3" +python-versions = "^3.9" +content-hash = "e6be32f86f1a6af0695f6846b57ed289e015b5634c7f574c45800095a84e2200" diff --git a/pyproject.toml b/pyproject.toml index f1d7c5f..9af9ee4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,13 +1,13 @@ [tool.poetry] name = "edtf" -version = "0.1.0" +version = "4.0.1+enh" description = "" authors = ["Andrew Hankinson "] readme = "README.md" -packages = [{include = "python_edtf"}] +packages = [{include = "edtf"}] [tool.poetry.dependencies] -python = "^3.11" +python = "^3.9" python-dateutil = "^2.9.0.post0" pyparsing = "^3.1.2" six = "^1.16.0" From 6e508d016e9bbcc49b90d3c88ca3512d69a0d193 Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Tue, 23 Jul 2024 17:03:14 +0200 Subject: [PATCH 06/40] Optimized regexes --- edtf/natlang/en.py | 126 ++++++++++++++++++++++----------------------- 1 file changed, 63 insertions(+), 63 deletions(-) diff --git a/edtf/natlang/en.py b/edtf/natlang/en.py index 5263e07..4f68f21 100644 --- a/edtf/natlang/en.py +++ b/edtf/natlang/en.py @@ -1,9 +1,10 @@ """Utilities to derive an EDTF string from an (English) natural language string.""" from datetime import datetime +from typing import Optional + from dateutil.parser import parse import re from edtf import appsettings -from six.moves import xrange # two dates where every digit of an ISO date representation is different, @@ -12,24 +13,43 @@ DEFAULT_DATE_1 = datetime(1234, 1, 1, 0, 0) DEFAULT_DATE_2 = datetime(5678, 10, 10, 0, 0) -SHORT_YEAR_RE = r'(-?)([\du])([\dxu])([\dxu])([\dxu])' -LONG_YEAR_RE = r'y(-?)([1-9]\d\d\d\d+)' -CENTURY_RE = r'(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?' -CE_RE = r'(\d{1,4}) (ad|ce|bc|bce)' +SHORT_YEAR_RE = re.compile(r'(-?)([\du])([\dxu])([\dxu])([\dxu])') +LONG_YEAR_RE = re.compile(r'y(-?)([1-9]\d\d\d\d+)') +CENTURY_RE = re.compile(r'(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?') +CENTURY_RANGE = re.compile(r'\b(\d\d)(th|st|nd|rd|)-(\d\d)(th|st|nd|rd) [cC]') +CE_RE = re.compile(r'(\d{1,4}) (ad|ce|bc|bce)') +ONE_DIGIT_PARTIAL_FIRST = re.compile(r'\d\D\b') +TWO_DIGIT_PARTIAL_FIRST = re.compile(r'\d\d\b') +PARTIAL_CHECK = re.compile(r'\b\d\d\d\d$') +SLASH_YEAR = re.compile(r"(\d\d\d\d)/(\d\d\d\d)") +BEFORE_CHECK = re.compile(r"\b(?:before|earlier|avant)\b") +AFTER_CHECK = re.compile(r"\b(after|since|later|aprés|apres)\b") +APPROX_CHECK = re.compile(r'\b(?:ca?\.? ?\d{4}|circa|approx|approximately|around|about|~\d{3,4})|(?:^~)') +UNCERTAIN_CHECK = re.compile(r"\b(?:uncertain|possibly|maybe|guess|\d{3,4}\?)") +UNCERTAIN_REPL = re.compile(r'(\d{4})\?') +MIGHT_BE_CENTURY = re.compile(r'(\d{2}00)s') +MIGHT_BE_DECADE = re.compile(r'(\d{3}0)s') + +APPROX_CENTURY_RE = re.compile(r'\b(ca?\.?) ?(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?') +UNCERTAIN_CENTURY_RE = re.compile(r'(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?\?') + +APPROX_CE_RE = re.compile(r'\b(ca?\.?) ?(\d{1,4}) (ad|ce|bc|bce)') +UNCERTAIN_CE_RE = re.compile(r'(\d{1,4}) (ad|ce|bc|bce)\?') + # Set of RE rules that will cause us to abort text processing, since we know # the results will be wrong. REJECT_RULES = ( - r'.*dynasty.*', # Don't parse '23rd Dynasty' to 'uuuu-uu-23' + re.compile(r'.*dynasty.*'), # Don't parse '23rd Dynasty' to 'uuuu-uu-23' ) -def text_to_edtf(text): +def text_to_edtf(text: str) -> Optional[str]: """ Generate EDTF string equivalent of a given natural language date string. """ if not text: - return + return None t = text.lower() @@ -51,18 +71,18 @@ def text_to_edtf(text): # match looks from the beginning of the string, search # looks anywhere. - if re.match(r'\d\D\b', d2): # 1-digit year partial e.g. 1868-9 - if re.search(r'\b\d\d\d\d$', d1): # TODO: evaluate it and see if it's a year + if re.match(ONE_DIGIT_PARTIAL_FIRST, d2): # 1-digit year partial e.g. 1868-9 + if re.search(PARTIAL_CHECK, d1): # TODO: evaluate it and see if it's a year d2 = d1[-4:-1] + d2 - elif re.match(r'\d\d\b', d2): # 2-digit year partial e.g. 1809-10 - if re.search(r'\b\d\d\d\d$', d1): + elif re.match(TWO_DIGIT_PARTIAL_FIRST, d2): # 2-digit year partial e.g. 1809-10 + if re.search(PARTIAL_CHECK, d1): d2 = d1[-4:-2] + d2 else: - century_range_match = re.search(r'\b(\d\d)(th|st|nd|rd|)-(\d\d)(th|st|nd|rd) [cC]', "%s-%s" % (d1,d2)) + century_range_match = re.search(CENTURY_RANGE, f"{d1}-{d2}") if century_range_match: g = century_range_match.groups() - d1 = "%sC" % g[0] - d2 = "%sC" % g[2] + d1 = f"{g[0]}C" + d2 = f"{g[2]}C" r1 = text_to_edtf_date(d1) r2 = text_to_edtf_date(d2) @@ -77,9 +97,9 @@ def text_to_edtf(text): # This whole section could be more friendly. else: - int_match = re.search(r"(\d\d\d\d)\/(\d\d\d\d)", list_item) + int_match = re.search(SLASH_YEAR, list_item) if int_match: - return "[%s, %s]" % (int_match.group(1), int_match.group(2)) + return f"[{int_match.group(1)}, {int_match.group(2)}]" result = text_to_edtf_date(list_item) if result: @@ -87,23 +107,18 @@ def text_to_edtf(text): if result: break - is_before = re.findall(r'\bbefore\b', t) - is_before = is_before or re.findall(r'\bearlier\b', t) - is_before = is_before or re.findall(r'\baprés\b', t) - - is_after = re.findall(r'\bafter\b', t) - is_after = is_after or re.findall(r'\bsince\b', t) - is_after = is_after or re.findall(r'\blater\b', t) + is_before = re.findall(BEFORE_CHECK, t) + is_after = re.findall(AFTER_CHECK, t) if is_before: - result = u"unknown/%s" % result + result = f"unknown/{result}" elif is_after: - result = u"%s/unknown" % result + result = f"{result}/unknown" return result -def text_to_edtf_date(text): +def text_to_edtf_date(text) -> Optional[str]: """ Return EDTF string equivalent of a given natural language date string. @@ -112,39 +127,29 @@ def text_to_edtf_date(text): differ are undefined. """ if not text: - return + return None t = text.lower() result = '' for reject_re in REJECT_RULES: if re.match(reject_re, t): - return + return None # matches on '1800s'. Needs to happen before is_decade. - could_be_century = re.findall(r'(\d{2}00)s', t) + could_be_century: list = re.findall(MIGHT_BE_CENTURY, t) # matches on '1800s' and '1910s'. Removes the 's'. # Needs to happen before is_uncertain because e.g. "1860s?" - t, is_decade = re.subn(r'(\d{3}0)s', r'\1', t) + t, is_decade = re.subn(MIGHT_BE_DECADE, r'\1', t) # detect approximation signifiers # a few 'circa' abbreviations just before the year - is_approximate = re.findall(r'\b(ca?\.?) ?\d{4}', t) + is_approximate = re.findall(APPROX_CHECK, t) # the word 'circa' anywhere - is_approximate = is_approximate or re.findall(r'\bcirca\b', t) - # the word 'approx'/'around'/'about' anywhere - is_approximate = is_approximate or \ - re.findall(r'\b(approx|approximately|around|about)', t) - # a ~ before a year-ish number - is_approximate = is_approximate or re.findall(r'\b~\d{4}', t) - # a ~ at the beginning - is_approximate = is_approximate or re.findall(r'^~', t) # detect uncertainty signifiers - t, is_uncertain = re.subn(r'(\d{4})\?', r'\1', t) - # the words uncertain/maybe/guess anywhere - is_uncertain = is_uncertain or re.findall( - r'\b(uncertain|possibly|maybe|guess)', t) + t, is_uncertain = re.subn(UNCERTAIN_REPL, r'\1', t) + is_uncertain = is_uncertain or re.findall(UNCERTAIN_CHECK, t) # detect century forms is_century = re.findall(CENTURY_RE, t) @@ -153,27 +158,23 @@ def text_to_edtf_date(text): is_ce = re.findall(CE_RE, t) if is_century: result = "%02dxx" % (int(is_century[0][0]) - 1,) - is_approximate = is_approximate or \ - re.findall(r'\b(ca?\.?) ?' + CENTURY_RE, t) - is_uncertain = is_uncertain or re.findall(CENTURY_RE + r'\?', t) + is_approximate = is_approximate or re.findall(APPROX_CENTURY_RE, t) + is_uncertain = is_uncertain or re.findall(UNCERTAIN_CENTURY_RE, t) try: - is_bc = is_century[0][-1] in ("bc", "bce") - if is_bc: - result = "-%s" % result + if is_century[0][-1] in ("bc", "bce"): + result = f"-{result}" except IndexError: pass elif is_ce: result = "%04d" % (int(is_ce[0][0])) - is_approximate = is_approximate or \ - re.findall(r'\b(ca?\.?) ?' + CE_RE, t) - is_uncertain = is_uncertain or re.findall(CE_RE + r'\?', t) + is_approximate = is_approximate or re.findall(APPROX_CE_RE, t) + is_uncertain = is_uncertain or re.findall(UNCERTAIN_CE_RE, t) try: - is_bc = is_ce[0][-1] in ("bc", "bce") - if is_bc: - result = "-%s" % result + if is_ce[0][-1] in ("bc", "bce"): + result = f"-{result}" except IndexError: pass @@ -200,12 +201,12 @@ def text_to_edtf_date(text): ) except ValueError: - return + return None if dt1.date() == DEFAULT_DATE_1.date() and \ dt2.date() == DEFAULT_DATE_2.date(): # couldn't parse anything - defaults are untouched. - return + return None date1 = dt1.isoformat()[:10] date2 = dt2.isoformat()[:10] @@ -215,14 +216,13 @@ def text_to_edtf_date(text): mentions_month = re.findall(r'\bmonth\b.+(in|during)\b', t) mentions_day = re.findall(r'\bday\b.+(in|during)\b', t) - for i in xrange(len(date1)): + for i in range(len(date1)): # if the given year could be a century (e.g. '1800s') then use # approximate/uncertain markers to decide whether we treat it as # a century or a decade. - if i == 2 and could_be_century and \ - not (is_approximate or is_uncertain): + if i == 2 and could_be_century and not (is_approximate or is_uncertain): result += 'x' - elif i == 3 and is_decade > 0: + elif i == 3 and is_decade: if mentions_year: result += 'u' # year precision else: @@ -238,7 +238,7 @@ def text_to_edtf_date(text): # strip off unknown chars from end of string - except the first 4 - for i in reversed(xrange(len(result))): + for i in reversed(range(len(result))): if result[i] not in ('u', 'x', '-'): smallest_length = 4 From f2252f03c23b1f7a6a153ccf750e97a94ce71dd2 Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Tue, 23 Jul 2024 17:18:26 +0200 Subject: [PATCH 07/40] Package updates --- edtf/convert.py | 8 +- edtf/jdutil.py | 32 +++---- edtf/natlang/en.py | 11 ++- edtf/natlang/tests.py | 4 +- edtf/parser/grammar.py | 14 +-- edtf/parser/parser_classes.py | 159 +++++++++++++++++----------------- edtf/parser/tests.py | 66 +++++++------- 7 files changed, 152 insertions(+), 142 deletions(-) diff --git a/edtf/convert.py b/edtf/convert.py index c1bfd3a..de1f2a2 100644 --- a/edtf/convert.py +++ b/edtf/convert.py @@ -59,8 +59,7 @@ def trim_struct_time(st, strip_time=False): """ if strip_time: return struct_time(list(st[:3]) + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS) - else: - return struct_time(list(st[:6]) + TIME_EMPTY_EXTRAS) + return struct_time(list(st[:6]) + TIME_EMPTY_EXTRAS) def struct_time_to_jd(st): @@ -106,7 +105,7 @@ def jd_to_struct_time(jd): ) -def _roll_negative_time_fields(year, month, day, hour, minute, second): +def _roll_negative_time_fields(year, month, day, hour, minute, second) -> tuple: """ Fix date/time fields which have nonsense negative values for any field except for year by rolling the overall date/time value backwards, treating @@ -142,4 +141,5 @@ def _roll_negative_time_fields(year, month, day, hour, minute, second): year += int(month / 12.0) # Adjust by whole year in months year -= 1 # Subtract 1 for negative minutes month %= 12 # Convert negative month to positive remainder - return (year, month, day, hour, minute, second) + + return year, month, day, hour, minute, second diff --git a/edtf/jdutil.py b/edtf/jdutil.py index 9fabdd1..4a12b58 100644 --- a/edtf/jdutil.py +++ b/edtf/jdutil.py @@ -17,7 +17,8 @@ # 10-14-1582 never occurred. Python datetime objects will produce incorrect # time deltas if one date is from before 10-15-1582. -def mjd_to_jd(mjd): + +def mjd_to_jd(mjd: float) -> float: """ Convert Modified Julian Day to Julian Day. @@ -30,13 +31,11 @@ def mjd_to_jd(mjd): ------- jd : float Julian Day - - """ return mjd + 2400000.5 -def jd_to_mjd(jd): +def jd_to_mjd(jd: float) -> float: """ Convert Julian Day to Modified Julian Day @@ -54,7 +53,7 @@ def jd_to_mjd(jd): return jd - 2400000.5 -def date_to_jd(year,month,day): +def date_to_jd(year: int, month: int, day: float) -> float: """ Convert a date to Julian Day. @@ -117,7 +116,7 @@ def date_to_jd(year,month,day): return jd -def jd_to_date(jd): +def jd_to_date(jd: float) -> (int, int, float): """ Convert Julian Day to date. @@ -184,7 +183,10 @@ def jd_to_date(jd): return year, month, day -def hmsm_to_days(hour=0,min=0,sec=0,micro=0): +def hmsm_to_days(hour: int = 0, + min: int = 0, + sec: int = 0, + micro: int = 0) -> float: """ Convert hours, minutes, seconds, and microseconds to fractional days. @@ -222,7 +224,7 @@ def hmsm_to_days(hour=0,min=0,sec=0,micro=0): return days / 24. -def days_to_hmsm(days): +def days_to_hmsm(days: float) -> (int, int, int, int): """ Convert fractional days to hours, minutes, seconds, and microseconds. Precision beyond microseconds is rounded to the nearest microsecond. @@ -271,7 +273,7 @@ def days_to_hmsm(days): return int(hour), int(min), int(sec), int(micro) -def datetime_to_jd(date): +def datetime_to_jd(date: dt.datetime) -> float: """ Convert a `datetime.datetime` object to Julian Day. @@ -298,7 +300,7 @@ def datetime_to_jd(date): return date_to_jd(date.year,date.month,days) -def jd_to_datetime(jd): +def jd_to_datetime(jd: float) -> dt.datetime: """ Convert a Julian Day to an `jdutil.datetime` object. @@ -328,7 +330,7 @@ def jd_to_datetime(jd): return datetime(year,month,day,hour,min,sec,micro) -def timedelta_to_days(td): +def timedelta_to_days(td: dt.timedelta) -> float: """ Convert a `datetime.timedelta` object to a total number of days. @@ -372,7 +374,7 @@ class datetime(dt.datetime): datetime.datetime : Parent class. """ - def __add__(self,other): + def __add__(self, other): if not isinstance(other,dt.timedelta): s = "jdutil.datetime supports '+' only with datetime.timedelta" raise TypeError(s) @@ -383,7 +385,7 @@ def __add__(self,other): return jd_to_datetime(combined) - def __radd__(self,other): + def __radd__(self, other): if not isinstance(other,dt.timedelta): s = "jdutil.datetime supports '+' only with datetime.timedelta" raise TypeError(s) @@ -394,7 +396,7 @@ def __radd__(self,other): return jd_to_datetime(combined) - def __sub__(self,other): + def __sub__(self, other): if isinstance(other,dt.timedelta): days = timedelta_to_days(other) @@ -412,7 +414,7 @@ def __sub__(self,other): s += "datetime.timedelta, jdutil.datetime and datetime.datetime" raise TypeError(s) - def __rsub__(self,other): + def __rsub__(self, other): if not isinstance(other, (datetime,dt.datetime)): s = "jdutil.datetime supports '-' with: " s += "jdutil.datetime and datetime.datetime" diff --git a/edtf/natlang/en.py b/edtf/natlang/en.py index 4f68f21..8cb72c4 100644 --- a/edtf/natlang/en.py +++ b/edtf/natlang/en.py @@ -36,6 +36,11 @@ APPROX_CE_RE = re.compile(r'\b(ca?\.?) ?(\d{1,4}) (ad|ce|bc|bce)') UNCERTAIN_CE_RE = re.compile(r'(\d{1,4}) (ad|ce|bc|bce)\?') +MENTIONS_YEAR = re.compile(r'\byear\b.+(in|during)\b') +MENTIONS_MONTH = re.compile(r'\bmonth\b.+(in|during)\b') +MENTIONS_DAY = re.compile(r'\bday\b.+(in|during)\b') + + # Set of RE rules that will cause us to abort text processing, since we know # the results will be wrong. @@ -212,9 +217,9 @@ def text_to_edtf_date(text) -> Optional[str]: date2 = dt2.isoformat()[:10] # guess precision of 'unspecified' characters to use - mentions_year = re.findall(r'\byear\b.+(in|during)\b', t) - mentions_month = re.findall(r'\bmonth\b.+(in|during)\b', t) - mentions_day = re.findall(r'\bday\b.+(in|during)\b', t) + mentions_year = re.findall(MENTIONS_YEAR, t) + mentions_month = re.findall(MENTIONS_MONTH, t) + mentions_day = re.findall(MENTIONS_DAY, t) for i in range(len(date1)): # if the given year could be a century (e.g. '1800s') then use diff --git a/edtf/natlang/tests.py b/edtf/natlang/tests.py index ea137d2..d18ec76 100644 --- a/edtf/natlang/tests.py +++ b/edtf/natlang/tests.py @@ -207,8 +207,8 @@ def test_natlang(self): """ for i, o in EXAMPLES: e = text_to_edtf(i) - print("%s => %s" % (i, e)) - self.assertEqual(e, o) + print(f"{i} => {e}") + self.assertEqual(e, o, msg=f"Testing {i}") if __name__ == '__main__': diff --git a/edtf/parser/grammar.py b/edtf/parser/grammar.py index d612c5f..14cb3a4 100644 --- a/edtf/parser/grammar.py +++ b/edtf/parser/grammar.py @@ -282,14 +282,16 @@ def f(toks): edtfParser = level0Expression("level0") ^ level1Expression("level1") ^ level2Expression("level2") -def parse_edtf(str, parseAll=True, fail_silently=False): +def parse_edtf(inp: str, parse_all: bool = True, fail_silently: bool = False): + if not inp: + raise ParseException("You must supply some input text") + try: - if not str: - raise ParseException("You must supply some input text") - p = edtfParser.parseString(str.strip(), parseAll) - if p: - return p[0] + p = edtfParser.parseString(inp.strip(), parse_all) except ParseException as e: if fail_silently: return None raise EDTFParseException(e) + + if p: + return p[0] diff --git a/edtf/parser/parser_classes.py b/edtf/parser/parser_classes.py index b670296..ae7adb4 100644 --- a/edtf/parser/parser_classes.py +++ b/edtf/parser/parser_classes.py @@ -3,6 +3,7 @@ from time import struct_time from datetime import date, datetime from operator import add, sub +from typing import Optional from dateutil.relativedelta import relativedelta @@ -22,7 +23,7 @@ PRECISION_DAY = "day" -def days_in_month(year, month): +def days_in_month(year: int, month: int) -> dict: """ Return the number of days in the given year and month, where month is 1=January to 12=December, and respecting leap years as identified by @@ -85,11 +86,15 @@ def apply_delta(op, time_struct, delta): class EDTFObject(object): """ - Object to attact to a parser to become instantiated when the parser + Object to attach to a parser to become instantiated when the parser completes. """ parser = None + def __init__(self, *args, **kwargs): + errmsg: str = f"{type(self).__name__}.__init__(*{args}, **{kwargs})" + raise NotImplementedError(f"{errmsg} is not implemented.") + @classmethod def set_parser(cls, p): cls.parser = p @@ -99,7 +104,7 @@ def set_parser(cls, p): def parse_action(cls, toks): kwargs = toks.asDict() try: - return cls(**kwargs) # replace the token list with the class + return cls(**kwargs) # replace the token list with the class except Exception as e: print("trying to %s.__init__(**%s)" % (cls.__name__, kwargs)) raise e @@ -109,19 +114,12 @@ def parse(cls, s): return cls.parser.parseString(s)[0] def __repr__(self): - return "%s: '%s'" % (type(self).__name__, str(self)) - - def __init__(self, *args, **kwargs): - str = "%s.__init__(*%s, **%s)" % ( - type(self).__name__, - args, kwargs, - ) - raise NotImplementedError("%s is not implemented." % str) + return f"{type(self).__name__}: '{str(self)}'" def __str__(self): raise NotImplementedError - def _strict_date(self, lean): + def _strict_date(self, lean: str): raise NotImplementedError def lower_strict(self): @@ -130,7 +128,7 @@ def lower_strict(self): def upper_strict(self): return self._strict_date(lean=LATEST) - def _get_fuzzy_padding(self, lean): + def _get_fuzzy_padding(self, lean: str): """ Subclasses should override this to pad based on how precise they are. """ @@ -216,41 +214,40 @@ def __le__(self, other): # (* ************************** Level 0 *************************** *) class Date(EDTFObject): + def __init__(self, year=None, month=None, day=None, **kwargs): + for param in ('date', 'lower', 'upper'): + if param in kwargs: + self.__init__(**kwargs[param]) + return + + self.year = year # Year is required, but sometimes passed in as a 'date' dict. + self.month = month + self.day = day - def set_year(self, y): + def set_year(self, y: int): if y is None: raise AttributeError("Year must not be None") self._year = y - def get_year(self): + def get_year(self) -> int: return self._year year = property(get_year, set_year) - def set_month(self, m): + def set_month(self, m: Optional[int]): self._month = m - if m == None: + if m is None: self.day = None - def get_month(self): + def get_month(self) -> Optional[int]: return self._month month = property(get_month, set_month) - def __init__(self, year=None, month=None, day=None, **kwargs): - for param in ('date', 'lower', 'upper'): - if param in kwargs: - self.__init__(**kwargs[param]) - return - - self.year = year # Year is required, but sometimes passed in as a 'date' dict. - self.month = month - self.day = day - def __str__(self): r = self.year if self.month: - r += "-%s" % self.month + r += f"-{self.month}" if self.day: - r += "-%s" % self.day + r += f"-{self.day}" return r def isoformat(self, default=date.max): @@ -260,14 +257,14 @@ def isoformat(self, default=date.max): int(self.day or default.day), ) - def _precise_year(self, lean): + def _precise_year(self, lean: str): # Replace any ambiguous characters in the year string with 0s or 9s if lean == EARLIEST: return int(re.sub(r'[xu]', r'0', self.year)) else: return int(re.sub(r'[xu]', r'9', self.year)) - def _precise_month(self, lean): + def _precise_month(self, lean: str): if self.month and self.month != "uu": try: return int(self.month) @@ -276,7 +273,7 @@ def _precise_month(self, lean): else: return 1 if lean == EARLIEST else 12 - def _precise_day(self, lean): + def _precise_day(self, lean: str): if not self.day or self.day == 'uu': if lean == EARLIEST: return 1 @@ -343,7 +340,7 @@ def __init__(self, lower, upper): self.upper = upper def __str__(self): - return "%s/%s" % (self.lower, self.upper) + return f"{self.lower}/{self.upper}" def _strict_date(self, lean): if lean == EARLIEST: @@ -416,8 +413,8 @@ def __str__(self): def _strict_date(self, lean): if self.date == "open": return dt_to_struct_time(date.today()) - if self.date =="unknown": - return None # depends on the other date + if self.date == "unknown": + return None # depends on the other date return self.date._strict_date(lean) def _get_fuzzy_padding(self, lean): @@ -454,12 +451,12 @@ def __init__(self, year): self.year = year def __str__(self): - return "y%s" % self.year + return f"y{self.year}" def _precise_year(self): return int(self.year) - def _strict_date(self, lean): + def _strict_date(self, lean: str): py = self._precise_year() if lean == EARLIEST: return struct_time( @@ -478,30 +475,26 @@ def __init__(self, year, season, **kwargs): self.day = None def __str__(self): - return "%s-%s" % (self.year, self.season) + return f"{self.year}-{self.season}" def _precise_month(self, lean): rng = appsettings.SEASON_MONTHS_RANGE[int(self.season)] if lean == EARLIEST: return rng[0] - else: - return rng[1] + + return rng[1] # (* ************************** Level 2 *************************** *) class PartialUncertainOrApproximate(Date): - - def set_year(self, y): # Year can be None. - self._year = y - year = property(Date.get_year, set_year) - def __init__( self, year=None, month=None, day=None, - year_ua=False, month_ua = False, day_ua = False, - year_month_ua = False, month_day_ua = False, - ssn=None, season_ua=False, all_ua=False + year_ua: Optional[UA] = None, month_ua: Optional[UA] = None, + day_ua: Optional[UA] = None, year_month_ua: Optional[UA] = None, + month_day_ua: Optional[UA] = None, ssn=None, + season_ua: Optional[UA] = None, all_ua: Optional[UA] = None ): self.year = year self.month = month @@ -520,56 +513,60 @@ def __init__( self.all_ua = all_ua def __str__(self): - if self.season_ua: - return "%s%s" % (self.season, self.season_ua) + return f"{self.season}{self.season_ua}" if self.year_ua: - y = "%s%s" % (self.year, self.year_ua) + y = f"{self.year}{self.year_ua}" else: y = str(self.year) if self.month_ua: - m = "(%s)%s" % (self.month, self.month_ua) + m = f"({self.month}){self.month_ua}" else: m = str(self.month) if self.day: if self.day_ua: - d = "(%s)%s" % (self.day, self.day_ua) + d = f"({self.day}){self.day_ua}" else: d = str(self.day) else: d = None if self.year_month_ua: # year/month approximate. No brackets needed. - ym = "%s-%s%s" % (y, m, self.year_month_ua) + ym = f"{y}-{m}{self.year_month_ua}" if d: - result = "%s-%s" % (ym, d) + result = f"{ym}-{d}" else: result = ym + elif self.month_day_ua: - if self.year_ua: # we don't need the brackets round month and day - result = "%s-%s-%s%s" % (y, m, d, self.month_day_ua) + if self.year_ua: # we don't need the brackets round month and day + result = f"{y}-{m}-{d}{self.month_day_ua}" else: - result = "%s-(%s-%s)%s" % (y, m, d, self.month_day_ua) + result = f"{y}-({m}-{d}){self.month_day_ua}" else: if d: - result = "%s-%s-%s" % (y, m, d) + result = f"{y}-{m}-{d}" else: - result = "%s-%s" % (y, m) + result = f"{y}-{m}" if self.all_ua: - result = "(%s)%s" % (result, self.all_ua) + result = f"({result}){self.all_ua}" return result - def _precise_year(self, lean): + def set_year(self, y): # Year can be None. + self._year = y + year = property(Date.get_year, set_year) + + def _precise_year(self, lean: str): if self.season: return self.season._precise_year(lean) return super(PartialUncertainOrApproximate, self)._precise_year(lean) - def _precise_month(self, lean): + def _precise_month(self, lean: str): if self.season: return self.season._precise_month(lean) return super(PartialUncertainOrApproximate, self)._precise_month(lean) @@ -638,7 +635,7 @@ def __init__(self, lower=None, upper=None): self.upper = upper def __str__(self): - return "%s..%s" % (self.lower or '', self.upper or '') + return f"{self.lower or ''}..{self.upper or ''}" class EarlierConsecutives(Consecutives): @@ -650,41 +647,40 @@ class LaterConsecutives(Consecutives): class OneOfASet(EDTFObject): + def __init__(self, *args): + self.objects = args + @classmethod def parse_action(cls, toks): args = [t for t in toks.asList() if isinstance(t, EDTFObject)] return cls(*args) - def __init__(self, *args): - self.objects = args - def __str__(self): - return "[%s]" % (", ".join([str(o) for o in self.objects])) + return f"[{', '.join([str(o) for o in self.objects])}]" - def _strict_date(self, lean): + def _strict_date(self, lean: str): if lean == LATEST: return max([x._strict_date(lean) for x in self.objects]) - else: - return min([x._strict_date(lean) for x in self.objects]) + + return min([x._strict_date(lean) for x in self.objects]) class MultipleDates(EDTFObject): + def __init__(self, *args): + self.objects = args + @classmethod def parse_action(cls, toks): args = [t for t in toks.asList() if isinstance(t, EDTFObject)] return cls(*args) - def __init__(self, *args): - self.objects = args - def __str__(self): - return "{%s}" % (", ".join([str(o) for o in self.objects])) + return f"{{{', '.join([str(o) for o in self.objects])}}}" def _strict_date(self, lean): if lean == LATEST: return max([x._strict_date(lean) for x in self.objects]) - else: - return min([x._strict_date(lean) for x in self.objects]) + return min([x._strict_date(lean) for x in self.objects]) class MaskedPrecision(Date): @@ -695,12 +691,13 @@ class Level2Interval(Level1Interval): def __init__(self, lower, upper): # Check whether incoming lower/upper values are single-item lists, and # if so take just the first item. This works around what I *think* is a - # bug in the grammer that provides us with single-item lists of + # bug in the grammar that provides us with single-item lists of # `PartialUncertainOrApproximate` items for lower/upper values. if isinstance(lower, (tuple, list)) and len(lower) == 1: self.lower = lower[0] else: self.lower = lower + if isinstance(lower, (tuple, list)) and len(upper) == 1: self.upper = upper[0] else: @@ -718,7 +715,7 @@ def _precise_year(self): def get_year(self): if self.precision: - return '%se%sp%s' % (self.base, self.exponent, self.precision) + return f'{self.base}e{self.exponent}p{self.precision}' else: - return '%se%s' % (self.base, self.exponent) + return f'{self.base}e{self.exponent}' year = property(get_year) diff --git a/edtf/parser/tests.py b/edtf/parser/tests.py index f9dde42..77c2ad3 100644 --- a/edtf/parser/tests.py +++ b/edtf/parser/tests.py @@ -3,10 +3,11 @@ from datetime import date from time import struct_time +from pyparsing import ParseException + from edtf.parser.grammar import parse_edtf as parse from edtf.parser.parser_classes import EDTFObject, TIME_EMPTY_TIME, \ TIME_EMPTY_EXTRAS -from edtf.parser.edtf_exceptions import EDTFParseException # Example object types and attributes. # the first item in each tuple is the input EDTF string, and expected parse result. @@ -192,17 +193,30 @@ None, '', 'not a edtf string', - 'y17e7-12-26', # not implemented - '2016-13-08', # wrong day order - '2016-02-39', # out of range + 'y17e7-12-26', # not implemented + '2016-13-08', # wrong day order + '2016-02-39', # out of range '-0000-01-01', # negative zero year ) class TestParsing(unittest.TestCase): + def iso_to_struct_time(self, iso_date): + """ Convert YYYY-mm-dd date strings to time structs """ + if iso_date[0] == '-': + is_negative = True + iso_date = iso_date[1:] + else: + is_negative = False + y, mo, d = [int(i) for i in iso_date.split('-')] + if is_negative: + y *= -1 + return struct_time( + [y, mo, d] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS) + def test_non_parsing(self): for i in BAD_EXAMPLES: - self.assertRaises(EDTFParseException, parse, i) + self.assertRaises(ParseException, parse, i) def test_date_values(self): """ @@ -217,13 +231,15 @@ def test_date_values(self): else: o = i - sys.stdout.write("parsing '%s'" % i) + sys.stdout.write(f"parsing '{i}'") f = parse(i) - sys.stdout.write(" => %s()\n" % type(f).__name__) + sys.stdout.write(f" => {type(f).__name__}()\n") self.assertIsInstance(f, EDTFObject) - self.assertEqual(str(f), o) + self.assertEqual(str(f), o, msg=f"Testing {i}") - if len(e) == 5: + if len(e) == 1: + continue + elif len(e) == 5: expected_lower_strict = e[1] expected_upper_strict = e[2] expected_lower_fuzzy = e[3] @@ -243,33 +259,21 @@ def test_date_values(self): expected_upper_strict = e[1] expected_lower_fuzzy = e[1] expected_upper_fuzzy = e[1] - if len(e) == 1: + else: + print(f"Unexpected value {e}; skipping.") continue - def iso_to_struct_time(iso_date): - """ Convert YYYY-mm-dd date strings to time structs """ - if iso_date[0] == '-': - is_negative = True - iso_date = iso_date[1:] - else: - is_negative = False - y, mo, d = [int(i) for i in iso_date.split('-')] - if is_negative: - y *= -1 - return struct_time( - [y, mo, d] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS) - # Convert string date representations into `struct_time`s - expected_lower_strict = iso_to_struct_time(expected_lower_strict) - expected_upper_strict = iso_to_struct_time(expected_upper_strict) - expected_lower_fuzzy = iso_to_struct_time(expected_lower_fuzzy) - expected_upper_fuzzy = iso_to_struct_time(expected_upper_fuzzy) + exp_lower_str = self.iso_to_struct_time(expected_lower_strict) + exp_upper_str = self.iso_to_struct_time(expected_upper_strict) + exp_lower_fuzz = self.iso_to_struct_time(expected_lower_fuzzy) + exp_upper_fuzz = self.iso_to_struct_time(expected_upper_fuzzy) try: - self.assertEqual(f.lower_strict(), expected_lower_strict) - self.assertEqual(f.upper_strict(), expected_upper_strict) - self.assertEqual(f.lower_fuzzy(), expected_lower_fuzzy) - self.assertEqual(f.upper_fuzzy(), expected_upper_fuzzy) + self.assertEqual(f.lower_strict(), exp_lower_str) + self.assertEqual(f.upper_strict(), exp_upper_str) + self.assertEqual(f.lower_fuzzy(), exp_lower_fuzz) + self.assertEqual(f.upper_fuzzy(), exp_upper_fuzz) except Exception as x: # Write to stdout for manual debugging, I guess sys.stdout.write(str(x)) From 06ab934befb7a665301587134794ddbc50b60964 Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Wed, 24 Jul 2024 11:18:51 +0200 Subject: [PATCH 08/40] Further optimizations --- edtf/natlang/en.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/edtf/natlang/en.py b/edtf/natlang/en.py index 8cb72c4..d7d7b8d 100644 --- a/edtf/natlang/en.py +++ b/edtf/natlang/en.py @@ -1,4 +1,5 @@ """Utilities to derive an EDTF string from an (English) natural language string.""" +import functools from datetime import datetime from typing import Optional @@ -40,15 +41,12 @@ MENTIONS_MONTH = re.compile(r'\bmonth\b.+(in|during)\b') MENTIONS_DAY = re.compile(r'\bday\b.+(in|during)\b') - - # Set of RE rules that will cause us to abort text processing, since we know # the results will be wrong. -REJECT_RULES = ( - re.compile(r'.*dynasty.*'), # Don't parse '23rd Dynasty' to 'uuuu-uu-23' -) +REJECT_RULES = re.compile(r'.*dynasty.*') # Don't parse '23rd Dynasty' to 'uuuu-uu-23' +@functools.lru_cache() def text_to_edtf(text: str) -> Optional[str]: """ Generate EDTF string equivalent of a given natural language date string. @@ -123,7 +121,8 @@ def text_to_edtf(text: str) -> Optional[str]: return result -def text_to_edtf_date(text) -> Optional[str]: +@functools.lru_cache() +def text_to_edtf_date(text: str) -> Optional[str]: """ Return EDTF string equivalent of a given natural language date string. @@ -137,9 +136,8 @@ def text_to_edtf_date(text) -> Optional[str]: t = text.lower() result = '' - for reject_re in REJECT_RULES: - if re.match(reject_re, t): - return None + if re.match(REJECT_RULES, t): + return None # matches on '1800s'. Needs to happen before is_decade. could_be_century: list = re.findall(MIGHT_BE_CENTURY, t) @@ -185,7 +183,6 @@ def text_to_edtf_date(text) -> Optional[str]: else: # try dateutil.parse - try: # parse twice, using different defaults to see what was # parsed and what was guessed. From c9cb56fe7dfcfe3f55ee981106bce7e73e7b7554 Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Mon, 12 Aug 2024 14:27:41 +0200 Subject: [PATCH 09/40] Update gitignore --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index ba74660..4d58675 100644 --- a/.gitignore +++ b/.gitignore @@ -55,3 +55,5 @@ docs/_build/ # PyBuilder target/ +.idea +.DS_Store From 9e51373eea989f4ea306408138b31ce53bdef1ab Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Tue, 13 Aug 2024 15:01:47 +0200 Subject: [PATCH 10/40] Black formatting, updates --- edtf/natlang/en.py | 101 +++++++++++++++++++++++++-------------------- 1 file changed, 57 insertions(+), 44 deletions(-) diff --git a/edtf/natlang/en.py b/edtf/natlang/en.py index d7d7b8d..191199e 100644 --- a/edtf/natlang/en.py +++ b/edtf/natlang/en.py @@ -14,36 +14,42 @@ DEFAULT_DATE_1 = datetime(1234, 1, 1, 0, 0) DEFAULT_DATE_2 = datetime(5678, 10, 10, 0, 0) -SHORT_YEAR_RE = re.compile(r'(-?)([\du])([\dxu])([\dxu])([\dxu])') -LONG_YEAR_RE = re.compile(r'y(-?)([1-9]\d\d\d\d+)') -CENTURY_RE = re.compile(r'(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?') -CENTURY_RANGE = re.compile(r'\b(\d\d)(th|st|nd|rd|)-(\d\d)(th|st|nd|rd) [cC]') -CE_RE = re.compile(r'(\d{1,4}) (ad|ce|bc|bce)') -ONE_DIGIT_PARTIAL_FIRST = re.compile(r'\d\D\b') -TWO_DIGIT_PARTIAL_FIRST = re.compile(r'\d\d\b') -PARTIAL_CHECK = re.compile(r'\b\d\d\d\d$') +SHORT_YEAR_RE = re.compile(r"(-?)([\du])([\dxu])([\dxu])([\dxu])") +LONG_YEAR_RE = re.compile(r"y(-?)([1-9]\d\d\d\d+)") +CENTURY_RE = re.compile(r"(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?") +CENTURY_RANGE = re.compile(r"\b(\d\d)(th|st|nd|rd|)-(\d\d)(th|st|nd|rd) [cC]") +CE_RE = re.compile(r"(\d{1,4}) (ad|ce|bc|bce)") +ONE_DIGIT_PARTIAL_FIRST = re.compile(r"\d\D\b") +TWO_DIGIT_PARTIAL_FIRST = re.compile(r"\d\d\b") +PARTIAL_CHECK = re.compile(r"\b\d\d\d\d$") SLASH_YEAR = re.compile(r"(\d\d\d\d)/(\d\d\d\d)") BEFORE_CHECK = re.compile(r"\b(?:before|earlier|avant)\b") AFTER_CHECK = re.compile(r"\b(after|since|later|aprés|apres)\b") -APPROX_CHECK = re.compile(r'\b(?:ca?\.? ?\d{4}|circa|approx|approximately|around|about|~\d{3,4})|(?:^~)') +APPROX_CHECK = re.compile( + r"\b(?:ca?\.? ?\d{4}|circa|approx|approximately|around|about|~\d{3,4})|(?:^~)" +) UNCERTAIN_CHECK = re.compile(r"\b(?:uncertain|possibly|maybe|guess|\d{3,4}\?)") -UNCERTAIN_REPL = re.compile(r'(\d{4})\?') -MIGHT_BE_CENTURY = re.compile(r'(\d{2}00)s') -MIGHT_BE_DECADE = re.compile(r'(\d{3}0)s') +UNCERTAIN_REPL = re.compile(r"(\d{4})\?") +MIGHT_BE_CENTURY = re.compile(r"(\d{2}00)s") +MIGHT_BE_DECADE = re.compile(r"(\d{3}0)s") -APPROX_CENTURY_RE = re.compile(r'\b(ca?\.?) ?(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?') -UNCERTAIN_CENTURY_RE = re.compile(r'(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?\?') +APPROX_CENTURY_RE = re.compile( + r"\b(ca?\.?) ?(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?" +) +UNCERTAIN_CENTURY_RE = re.compile( + r"(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?\?" +) -APPROX_CE_RE = re.compile(r'\b(ca?\.?) ?(\d{1,4}) (ad|ce|bc|bce)') -UNCERTAIN_CE_RE = re.compile(r'(\d{1,4}) (ad|ce|bc|bce)\?') +APPROX_CE_RE = re.compile(r"\b(ca?\.?) ?(\d{1,4}) (ad|ce|bc|bce)") +UNCERTAIN_CE_RE = re.compile(r"(\d{1,4}) (ad|ce|bc|bce)\?") -MENTIONS_YEAR = re.compile(r'\byear\b.+(in|during)\b') -MENTIONS_MONTH = re.compile(r'\bmonth\b.+(in|during)\b') -MENTIONS_DAY = re.compile(r'\bday\b.+(in|during)\b') +MENTIONS_YEAR = re.compile(r"\byear\b.+(in|during)\b") +MENTIONS_MONTH = re.compile(r"\bmonth\b.+(in|during)\b") +MENTIONS_DAY = re.compile(r"\bday\b.+(in|during)\b") # Set of RE rules that will cause us to abort text processing, since we know # the results will be wrong. -REJECT_RULES = re.compile(r'.*dynasty.*') # Don't parse '23rd Dynasty' to 'uuuu-uu-23' +REJECT_RULES = re.compile(r".*dynasty.*") # Don't parse '23rd Dynasty' to 'uuuu-uu-23' @functools.lru_cache() @@ -57,16 +63,16 @@ def text_to_edtf(text: str) -> Optional[str]: t = text.lower() # try parsing the whole thing - result = text_to_edtf_date(t) + result: Optional[str] = text_to_edtf_date(t) if not result: # split by list delims and move fwd with the first thing that returns a non-empty string. # TODO: assemble multiple dates into a {} or [] structure. for split in [",", ";", "or"]: for list_item in t.split(split): - # try parsing as an interval - split by '-' - toks = list_item.split("-") + toks: list[str] = list_item.split("-") + if len(toks) == 2: d1 = toks[0].strip() d2 = toks[1].strip() @@ -74,10 +80,16 @@ def text_to_edtf(text: str) -> Optional[str]: # match looks from the beginning of the string, search # looks anywhere. - if re.match(ONE_DIGIT_PARTIAL_FIRST, d2): # 1-digit year partial e.g. 1868-9 - if re.search(PARTIAL_CHECK, d1): # TODO: evaluate it and see if it's a year + if re.match( + ONE_DIGIT_PARTIAL_FIRST, d2 + ): # 1-digit year partial e.g. 1868-9 + if re.search( + PARTIAL_CHECK, d1 + ): # TODO: evaluate it and see if it's a year d2 = d1[-4:-1] + d2 - elif re.match(TWO_DIGIT_PARTIAL_FIRST, d2): # 2-digit year partial e.g. 1809-10 + elif re.match( + TWO_DIGIT_PARTIAL_FIRST, d2 + ): # 2-digit year partial e.g. 1809-10 if re.search(PARTIAL_CHECK, d1): d2 = d1[-4:-2] + d2 else: @@ -134,7 +146,7 @@ def text_to_edtf_date(text: str) -> Optional[str]: return None t = text.lower() - result = '' + result: str = "" if re.match(REJECT_RULES, t): return None @@ -143,7 +155,7 @@ def text_to_edtf_date(text: str) -> Optional[str]: could_be_century: list = re.findall(MIGHT_BE_CENTURY, t) # matches on '1800s' and '1910s'. Removes the 's'. # Needs to happen before is_uncertain because e.g. "1860s?" - t, is_decade = re.subn(MIGHT_BE_DECADE, r'\1', t) + t, is_decade = re.subn(MIGHT_BE_DECADE, r"\1", t) # detect approximation signifiers # a few 'circa' abbreviations just before the year @@ -151,7 +163,7 @@ def text_to_edtf_date(text: str) -> Optional[str]: # the word 'circa' anywhere # detect uncertainty signifiers - t, is_uncertain = re.subn(UNCERTAIN_REPL, r'\1', t) + t, is_uncertain = re.subn(UNCERTAIN_REPL, r"\1", t) is_uncertain = is_uncertain or re.findall(UNCERTAIN_CHECK, t) # detect century forms @@ -191,7 +203,7 @@ def text_to_edtf_date(text: str) -> Optional[str]: dayfirst=appsettings.DAY_FIRST, yearfirst=False, fuzzy=True, # force a match, even if it's default date - default=DEFAULT_DATE_1 + default=DEFAULT_DATE_1, ) dt2 = parse( @@ -199,14 +211,13 @@ def text_to_edtf_date(text: str) -> Optional[str]: dayfirst=appsettings.DAY_FIRST, yearfirst=False, fuzzy=True, # force a match, even if it's default date - default=DEFAULT_DATE_2 + default=DEFAULT_DATE_2, ) except ValueError: return None - if dt1.date() == DEFAULT_DATE_1.date() and \ - dt2.date() == DEFAULT_DATE_2.date(): + if dt1.date() == DEFAULT_DATE_1.date() and dt2.date() == DEFAULT_DATE_2.date(): # couldn't parse anything - defaults are untouched. return None @@ -223,12 +234,12 @@ def text_to_edtf_date(text: str) -> Optional[str]: # approximate/uncertain markers to decide whether we treat it as # a century or a decade. if i == 2 and could_be_century and not (is_approximate or is_uncertain): - result += 'x' + result += "x" elif i == 3 and is_decade: if mentions_year: - result += 'u' # year precision + result += "X" # year precision else: - result += 'x' # decade precision + result += "x" # decade precision elif date1[i] == date2[i]: # since both attempts at parsing produced the same result # it must be parsed value, not a default @@ -236,12 +247,12 @@ def text_to_edtf_date(text: str) -> Optional[str]: else: # different values were produced, meaning that it's likely # a default. Use 'unspecified' - result += "u" + result += "X" # strip off unknown chars from end of string - except the first 4 for i in reversed(range(len(result))): - if result[i] not in ('u', 'x', '-'): + if result[i] not in ("X", "-"): smallest_length = 4 if mentions_month: @@ -265,14 +276,16 @@ def text_to_edtf_date(text: str) -> Optional[str]: # end dateutil post-parsing - if is_uncertain: - result += "?" - - if is_approximate: - result += "~" + if is_uncertain and is_approximate: + result += "%" + else: + if is_uncertain: + result += "?" + if is_approximate: + result += "~" # weed out bad parses - if result.startswith("uu-uu"): + if result.startswith("XX-XX"): return None return result From 1aa53cfb2d4e0a2a3c284ec20db60f841b88a7f9 Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Tue, 13 Aug 2024 15:03:16 +0200 Subject: [PATCH 11/40] Update imports --- edtf/natlang/en.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/edtf/natlang/en.py b/edtf/natlang/en.py index 191199e..ba192e8 100644 --- a/edtf/natlang/en.py +++ b/edtf/natlang/en.py @@ -1,12 +1,12 @@ """Utilities to derive an EDTF string from an (English) natural language string.""" import functools +import re from datetime import datetime from typing import Optional from dateutil.parser import parse -import re -from edtf import appsettings +from edtf import appsettings # two dates where every digit of an ISO date representation is different, # and one is in the past and one is in the future. From 8c4f9685bc31224bcd0efcf811485f2e3f34e292 Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Tue, 13 Aug 2024 16:48:01 +0200 Subject: [PATCH 12/40] Merge fixes --- edtf/natlang/en.py | 18 ++++++++++-------- edtf/parser/parser_classes.py | 1 + 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/edtf/natlang/en.py b/edtf/natlang/en.py index ba192e8..49b04f3 100644 --- a/edtf/natlang/en.py +++ b/edtf/natlang/en.py @@ -4,7 +4,7 @@ from datetime import datetime from typing import Optional -from dateutil.parser import parse +from dateutil.parser import ParserError, parse from edtf import appsettings @@ -126,9 +126,9 @@ def text_to_edtf(text: str) -> Optional[str]: is_after = re.findall(AFTER_CHECK, t) if is_before: - result = f"unknown/{result}" + result = f"/{result}" elif is_after: - result = f"{result}/unknown" + result = f"{result}/" return result @@ -172,7 +172,7 @@ def text_to_edtf_date(text: str) -> Optional[str]: # detect CE/BCE year form is_ce = re.findall(CE_RE, t) if is_century: - result = "%02dxx" % (int(is_century[0][0]) - 1,) + result = "%02dXX" % (int(is_century[0][0]) - 1,) is_approximate = is_approximate or re.findall(APPROX_CENTURY_RE, t) is_uncertain = is_uncertain or re.findall(UNCERTAIN_CENTURY_RE, t) @@ -214,8 +214,10 @@ def text_to_edtf_date(text: str) -> Optional[str]: default=DEFAULT_DATE_2, ) - except ValueError: - return None + except ParserError: + return + except Exception: + return if dt1.date() == DEFAULT_DATE_1.date() and dt2.date() == DEFAULT_DATE_2.date(): # couldn't parse anything - defaults are untouched. @@ -234,12 +236,12 @@ def text_to_edtf_date(text: str) -> Optional[str]: # approximate/uncertain markers to decide whether we treat it as # a century or a decade. if i == 2 and could_be_century and not (is_approximate or is_uncertain): - result += "x" + result += "X" elif i == 3 and is_decade: if mentions_year: result += "X" # year precision else: - result += "x" # decade precision + result += "X" # decade precision elif date1[i] == date2[i]: # since both attempts at parsing produced the same result # it must be parsed value, not a default diff --git a/edtf/parser/parser_classes.py b/edtf/parser/parser_classes.py index d103660..eada1f9 100644 --- a/edtf/parser/parser_classes.py +++ b/edtf/parser/parser_classes.py @@ -4,6 +4,7 @@ from datetime import date, datetime from operator import add, sub from time import struct_time +from typing import Optional from dateutil.relativedelta import relativedelta From 6f08bce95cb583f2825353cbe8ae6a1de1c47df7 Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Tue, 13 Aug 2024 16:55:59 +0200 Subject: [PATCH 13/40] ruff formatting --- edtf/natlang/en.py | 5 +++-- edtf/parser/parser_classes.py | 9 ++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/edtf/natlang/en.py b/edtf/natlang/en.py index 49b04f3..97230db 100644 --- a/edtf/natlang/en.py +++ b/edtf/natlang/en.py @@ -1,4 +1,5 @@ """Utilities to derive an EDTF string from an (English) natural language string.""" + import functools import re from datetime import datetime @@ -52,7 +53,7 @@ REJECT_RULES = re.compile(r".*dynasty.*") # Don't parse '23rd Dynasty' to 'uuuu-uu-23' -@functools.lru_cache() +@functools.lru_cache def text_to_edtf(text: str) -> Optional[str]: """ Generate EDTF string equivalent of a given natural language date string. @@ -133,7 +134,7 @@ def text_to_edtf(text: str) -> Optional[str]: return result -@functools.lru_cache() +@functools.lru_cache def text_to_edtf_date(text: str) -> Optional[str]: """ Return EDTF string equivalent of a given natural language date string. diff --git a/edtf/parser/parser_classes.py b/edtf/parser/parser_classes.py index eada1f9..ad690fb 100644 --- a/edtf/parser/parser_classes.py +++ b/edtf/parser/parser_classes.py @@ -98,10 +98,6 @@ class EDTFObject: parser = None - def __init__(self, *args, **kwargs): - errmsg: str = f"{type(self).__name__}.__init__(*{args}, **{kwargs})" - raise NotImplementedError(f"{errmsg} is not implemented.") - @classmethod def set_parser(cls, p): cls.parser = p @@ -288,6 +284,7 @@ def set_year(self, y: int): def get_year(self) -> int: return self._year + year = property(get_year, set_year) def set_month(self, m: Optional[int]): @@ -297,6 +294,7 @@ def set_month(self, m: Optional[int]): def get_month(self) -> Optional[int]: return self._month + month = property(get_month, set_month) def __str__(self): @@ -932,8 +930,9 @@ def __str__(self): return result - def set_year(self, y): # Year can be None. + def set_year(self, y): # Year can be None. self._year = y + year = property(Date.get_year, set_year) def _precise_year(self, lean: str): From 973ccf4cabcd21cc0d7af5e2d1c8bb86992c65e3 Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Tue, 13 Aug 2024 17:27:15 +0200 Subject: [PATCH 14/40] Remove accidentally committed poetry file --- poetry.lock | 45 --------------------------------------------- 1 file changed, 45 deletions(-) delete mode 100644 poetry.lock diff --git a/poetry.lock b/poetry.lock deleted file mode 100644 index c4b40b6..0000000 --- a/poetry.lock +++ /dev/null @@ -1,45 +0,0 @@ -# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. - -[[package]] -name = "pyparsing" -version = "3.1.2" -description = "pyparsing module - Classes and methods to define and execute parsing grammars" -optional = false -python-versions = ">=3.6.8" -files = [ - {file = "pyparsing-3.1.2-py3-none-any.whl", hash = "sha256:f9db75911801ed778fe61bb643079ff86601aca99fcae6345aa67292038fb742"}, - {file = "pyparsing-3.1.2.tar.gz", hash = "sha256:a1bac0ce561155ecc3ed78ca94d3c9378656ad4c94c1270de543f621420f94ad"}, -] - -[package.extras] -diagrams = ["jinja2", "railroad-diagrams"] - -[[package]] -name = "python-dateutil" -version = "2.9.0.post0" -description = "Extensions to the standard Python datetime module" -optional = false -python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" -files = [ - {file = "python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3"}, - {file = "python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427"}, -] - -[package.dependencies] -six = ">=1.5" - -[[package]] -name = "six" -version = "1.16.0" -description = "Python 2 and 3 compatibility utilities" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" -files = [ - {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, - {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, -] - -[metadata] -lock-version = "2.0" -python-versions = "^3.9" -content-hash = "e6be32f86f1a6af0695f6846b57ed289e015b5634c7f574c45800095a84e2200" From ee450a55a74069daf44da6c476a823dc879f6e78 Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Wed, 14 Aug 2024 13:14:07 +0200 Subject: [PATCH 15/40] Fixed: f-string formatting Also added Andrew Hankinson to the authors list in pyproject.toml --- edtf/natlang/en.py | 2 +- pyproject.toml | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/edtf/natlang/en.py b/edtf/natlang/en.py index 97230db..d57bb82 100644 --- a/edtf/natlang/en.py +++ b/edtf/natlang/en.py @@ -104,7 +104,7 @@ def text_to_edtf(text: str) -> Optional[str]: r2 = text_to_edtf_date(d2) if r1 and r2: - result = r1 + "/" + r2 + result = f"{r1}/{r2}" return result # is it an either/or year "1838/1862" - that has a different diff --git a/pyproject.toml b/pyproject.toml index b48c3f7..2d050c2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,7 +16,8 @@ authors = [ { name = "Mark Finger" }, { name = "Sabine Müller" }, { name = "Cole Crawford" }, - { name = "Klaus Rettinghaus" } + { name = "Klaus Rettinghaus" }, + { name = "Andrew Hankinson", email = "andrew.hankinson@rism.digital" }, ] maintainers = [ { name = "The Interaction Consortium", email = "studio@interaction.net.au" } From 46bdce6bd97956088e932ba1ca359bac71ca3f06 Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Thu, 12 Sep 2024 15:40:50 +0200 Subject: [PATCH 16/40] Fixed: return type of statement --- edtf/parser/parser_classes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/edtf/parser/parser_classes.py b/edtf/parser/parser_classes.py index ad690fb..c334ee9 100644 --- a/edtf/parser/parser_classes.py +++ b/edtf/parser/parser_classes.py @@ -28,7 +28,7 @@ PRECISION_DAY = "day" -def days_in_month(year: int, month: int) -> dict: +def days_in_month(year: int, month: int) -> int: """ Return the number of days in the given year and month, where month is 1=January to 12=December, and respecting leap years as identified by From 656f8ad900ddd3d02ead2fce2eb9575c7d049025 Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Thu, 12 Sep 2024 15:44:38 +0200 Subject: [PATCH 17/40] Updated parser classes I've had a pass at the Parser Classes file, but there are a lot of problems still to be sorted out. I've added return types and argument types whereever it makes sense. The "UncertainOrApproximate" class is a hot mess. There are boolean values with property and method calls associated with them, and I would be surprised if it actually works. However, it doesn't seem to be tested or implemented, so I can't figure out where to go from here. --- edtf/parser/parser_classes.py | 192 +++++++++++++++++----------------- 1 file changed, 94 insertions(+), 98 deletions(-) diff --git a/edtf/parser/parser_classes.py b/edtf/parser/parser_classes.py index c334ee9..eb9fac5 100644 --- a/edtf/parser/parser_classes.py +++ b/edtf/parser/parser_classes.py @@ -126,7 +126,7 @@ def __init__(self, *args, **kwargs): def __str__(self): raise NotImplementedError - def _strict_date(self, lean: str): + def _strict_date(self, lean: str = EARLIEST): raise NotImplementedError def lower_strict(self): @@ -141,30 +141,31 @@ def _get_fuzzy_padding(self, lean: str): """ return relativedelta(0) - def get_is_approximate(self): + def get_is_approximate(self) -> bool: return getattr(self, "_is_approximate", False) - def set_is_approximate(self, val): + def set_is_approximate(self, val: bool) -> None: self._is_approximate = val - is_approximate = property(get_is_approximate, set_is_approximate) + is_approximate = property(get_is_approximate, set_is_approximate) # noqa - def get_is_uncertain(self): + def get_is_uncertain(self) -> bool: return getattr(self, "_is_uncertain", False) - def set_is_uncertain(self, val): + def set_is_uncertain(self, val: bool) -> None: self._is_uncertain = val - is_uncertain = property(get_is_uncertain, set_is_uncertain) + is_uncertain = property(get_is_uncertain, set_is_uncertain) # noqa - def get_is_uncertain_and_approximate(self): + def get_is_uncertain_and_approximate(self) -> bool: return getattr(self, "_uncertain_and_approximate", False) - def set_is_uncertain_and_approximate(self, val): + def set_is_uncertain_and_approximate(self, val: bool) -> None: self._uncertain_and_approximate = val is_uncertain_and_approximate = property( - get_is_uncertain_and_approximate, set_is_uncertain_and_approximate + get_is_uncertain_and_approximate, # noqa + set_is_uncertain_and_approximate, # noqa ) def lower_fuzzy(self): @@ -242,76 +243,71 @@ def __le__(self, other): class Date(EDTFObject): - def set_year(self, y): - if y is None: - raise AttributeError("Year must not be None") - self._year = y - - def get_year(self): - return self._year - - year = property(get_year, set_year) - - def set_month(self, m): - self._month = m - if m is None: - self.day = None - - def get_month(self): - return self._month - - month = property(get_month, set_month) - - def __init__( - self, year=None, month=None, day=None, significant_digits=None, **kwargs + def __init__( # noqa + self, + year: Optional[str] = None, + month: Optional[str] = None, + day: Optional[str] = None, + significant_digits=None, + **kwargs, ): for param in ("date", "lower", "upper"): if param in kwargs: self.__init__(**kwargs[param]) return - self.year = year # Year is required, but sometimes passed in as a 'date' dict. - self.month = month - self.day = day + self._year = year # Year is required, but sometimes passed in as a 'date' dict. + self._month = month + self._day = day self.significant_digits = ( int(significant_digits) if significant_digits else None ) - def set_year(self, y: int): + def set_year(self, y: str): if y is None: raise AttributeError("Year must not be None") self._year = y - def get_year(self) -> int: + def get_year(self) -> str: return self._year - year = property(get_year, set_year) + year = property(get_year, set_year) # noqa - def set_month(self, m: Optional[int]): + def set_month(self, m: Optional[str]): self._month = m if m is None: - self.day = None + self._day = None - def get_month(self) -> Optional[int]: + def get_month(self) -> Optional[str]: return self._month - month = property(get_month, set_month) + month = property(get_month, set_month) # noqa + + def set_day(self, d: Optional[str]): + self._day = d + if d is None: + self._day = None + + def get_day(self) -> Optional[str]: + return self._day + + day = property(get_day, set_day) # noqa def __str__(self): - r = self.year - if self.month: - r += f"-{self.month}" - if self.day: - r += f"-{self.day}" + r = self._year + if self._month: + r += f"-{self._month}" + if self._day: + r += f"-{self._day}" if self.significant_digits: r += f"S{self.significant_digits}" return r def isoformat(self, default=date.max): return "%s-%02d-%02d" % ( - self.year, - int(self.month or default.month), - int(self.day or default.day), + self._year, + int(self._month or default.month), + int(self._day or default.day), ) def lower_fuzzy(self): @@ -320,10 +316,10 @@ def lower_fuzzy(self): sub, self.lower_strict(), self._get_fuzzy_padding(EARLIEST) ) else: - total_digits = len(self.year) + total_digits = len(self._year) insignificant_digits = total_digits - self.significant_digits lower_year = ( - int(self.year) + int(self._year) // (10**insignificant_digits) * (10**insignificant_digits) ) @@ -335,9 +331,9 @@ def upper_fuzzy(self): add, self.upper_strict(), self._get_fuzzy_padding(LATEST) ) else: - total_digits = len(self.year) + total_digits = len(self._year) insignificant_digits = total_digits - self.significant_digits - upper_year = (int(self.year) // (10**insignificant_digits) + 1) * ( + upper_year = (int(self._year) // (10**insignificant_digits) + 1) * ( 10**insignificant_digits ) - 1 return struct_time( @@ -347,23 +343,23 @@ def upper_fuzzy(self): def _precise_year(self, lean): # Replace any ambiguous characters in the year string with 0s or 9s if lean == EARLIEST: - return int(re.sub(r"X", r"0", self.year)) + return int(re.sub(r"X", r"0", self._year)) else: - return int(re.sub(r"X", r"9", self.year)) + return int(re.sub(r"X", r"9", self._year)) def _precise_month(self, lean): - if self.month and self.month != "XX": + if self._month and self._month != "XX": try: - return int(self.month) + return int(self._month) except ValueError as err: raise ValueError( - f"Couldn't convert {self.month} to int (in {self})" + f"Couldn't convert {self._month} to int (in {self})" ) from err else: return 1 if lean == EARLIEST else 12 def _precise_day(self, lean): - if not self.day or self.day == "XX": + if not self._day or self._day == "XX": if lean == EARLIEST: return 1 else: @@ -371,9 +367,9 @@ def _precise_day(self, lean): self._precise_year(LATEST), self._precise_month(LATEST) ) else: - return int(self.day) + return int(self._day) - def _strict_date(self, lean): + def _strict_date(self, lean: str = EARLIEST): """ Return a `time.struct_time` representation of the date. """ @@ -389,9 +385,9 @@ def _strict_date(self, lean): @property def precision(self): - if self.day: + if self._day: return PRECISION_DAY - if self.month: + if self._month: return PRECISION_MONTH return PRECISION_YEAR @@ -400,7 +396,7 @@ def estimated(self): class DateAndTime(EDTFObject): - def __init__(self, date, time): + def __init__(self, date, time): # noqa: super raises not implemented self.date = date self.time = time @@ -410,7 +406,7 @@ def __str__(self): def isoformat(self): return self.date.isoformat() + "T" + self.time - def _strict_date(self, lean): + def _strict_date(self, lean: str = EARLIEST): return self.date._strict_date(lean) def __eq__(self, other): @@ -429,14 +425,14 @@ def __ne__(self, other): class Interval(EDTFObject): - def __init__(self, lower, upper): + def __init__(self, lower, upper): # noqa: super() raises not implemented self.lower = lower self.upper = upper def __str__(self): return f"{self.lower}/{self.upper}" - def _strict_date(self, lean): + def _strict_date(self, lean: str = EARLIEST): if lean == EARLIEST: r = self.lower._strict_date(lean) else: @@ -459,7 +455,7 @@ def parse_action(cls, toks): args = toks.asList() return cls(*args) - def __init__(self, *args): + def __init__(self, *args): # noqa: super() raises not implemented if len(args) != 1: raise AssertionError("UA must have exactly one argument") ua = args[0] @@ -488,7 +484,7 @@ def _get_multiplier(self): class UncertainOrApproximate(EDTFObject): - def __init__(self, date, ua): + def __init__(self, date, ua): # noqa: super() raises not implemented self.date = date self.ua = ua self.is_uncertain = ua.is_uncertain if ua else False @@ -503,7 +499,7 @@ def __str__(self): else: return str(self.date) - def _strict_date(self, lean): + def _strict_date(self, lean: str = EARLIEST): return self.date._strict_date(lean) def _get_fuzzy_padding(self, lean): @@ -532,7 +528,7 @@ def _get_fuzzy_padding(self, lean): class UnspecifiedIntervalSection(EDTFObject): - def __init__(self, sectionOpen=False, other_section_element=None): + def __init__(self, sectionOpen=False, other_section_element=None): # noqa: super() raises not implemented if sectionOpen: self.is_open = True self.is_unknown = False @@ -547,14 +543,17 @@ def __str__(self): else: return ".." - def _strict_date(self, lean): + def _strict_date(self, lean: str = EARLIEST): + if lean not in (EARLIEST, LATEST): + raise ValueError("lean must be one of EARLIEST or LATEST") + if lean == EARLIEST: if self.is_unknown: upper = self.other._strict_date(LATEST) return apply_delta(sub, upper, appsettings.DELTA_IF_UNKNOWN) else: return -math.inf - else: + elif lean == LATEST: if self.is_unknown: lower = self.other._strict_date(EARLIEST) return apply_delta(add, lower, appsettings.DELTA_IF_UNKNOWN) @@ -717,7 +716,7 @@ def precision(self): class Level1Interval(Interval): - def __init__(self, lower=None, upper=None): + def __init__(self, lower: Optional[dict] = None, upper: Optional[dict] = None): # noqa if lower: if lower["date"] == "..": self.lower = UnspecifiedIntervalSection( @@ -740,8 +739,10 @@ def __init__(self, lower=None, upper=None): self.upper = UnspecifiedIntervalSection( False, UncertainOrApproximate(**lower) ) - self.is_approximate = self.lower.is_approximate or self.upper.is_approximate - self.is_uncertain = self.lower.is_uncertain or self.upper.is_uncertain + self.is_approximate: bool = ( + self.lower.is_approximate or self.upper.is_approximate + ) + self.is_uncertain: bool = self.lower.is_uncertain or self.upper.is_uncertain self.is_uncertain_and_approximate = ( self.lower.is_uncertain_and_approximate or self.upper.is_uncertain_and_approximate @@ -755,7 +756,7 @@ def _get_fuzzy_padding(self, lean): class LongYear(EDTFObject): - def __init__(self, year, significant_digits=None): + def __init__(self, year: str, significant_digits: Optional[str] = None): # noqa self.year = year self.significant_digits = ( int(significant_digits) if significant_digits else None @@ -770,7 +771,7 @@ def __str__(self): def _precise_year(self): return int(self.year) - def _strict_date(self, lean: str): + def _strict_date(self, lean: str = EARLIEST): py = self._precise_year() if lean == EARLIEST: return struct_time([py, 1, 1] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS) @@ -818,7 +819,7 @@ def upper_fuzzy(self): class Season(Date): - def __init__(self, year, season, **kwargs): + def __init__(self, year, season, **kwargs): # noqa self.year = year self.season = season # use season to look up month # day isn't part of the 'season' spec, but it helps the inherited @@ -840,12 +841,7 @@ def _precise_month(self, lean): class PartialUncertainOrApproximate(Date): - def set_year(self, y): # Year can be None. - self._year = y - - year = property(Date.get_year, set_year) - - def __init__( + def __init__( # noqa self, year=None, month=None, @@ -933,7 +929,7 @@ def __str__(self): def set_year(self, y): # Year can be None. self._year = y - year = property(Date.get_year, set_year) + year = property(Date.get_year, set_year) # noqa def _precise_year(self, lean: str): if self.season: @@ -1018,7 +1014,7 @@ class PartialUnspecified(Unspecified): class Consecutives(Interval): # Treating Consecutive ranges as intervals where one bound is optional - def __init__(self, lower=None, upper=None): + def __init__(self, lower=None, upper=None): # noqa if lower and not isinstance(lower, EDTFObject): self.lower = Date.parse(lower) else: @@ -1044,7 +1040,7 @@ def __str__(self): class OneOfASet(EDTFObject): - def __init__(self, *args): + def __init__(self, *args): # noqa self.objects = args @classmethod @@ -1053,9 +1049,9 @@ def parse_action(cls, toks): return cls(*args) def __str__(self): - return "[{}]".format(", ".join([str(o) for o in self.objects])) + return f"[{", ".join([str(o) for o in self.objects])}]" - def _strict_date(self, lean): + def _strict_date(self, lean: str = EARLIEST): strict_dates = [x._strict_date(lean) for x in self.objects] # Accounting for possible 'inf' and '-inf' values if lean == LATEST: @@ -1077,7 +1073,7 @@ def _strict_date(self, lean): class MultipleDates(EDTFObject): - def __init__(self, *args): + def __init__(self, *args): # noqa self.objects = args @classmethod @@ -1086,16 +1082,16 @@ def parse_action(cls, toks): return cls(*args) def __str__(self): - return "{{{}}}".format(", ".join([str(o) for o in self.objects])) + return f"{{{", ".join([str(o) for o in self.objects])}}}" - def _strict_date(self, lean): + def _strict_date(self, lean: str = EARLIEST): if lean == LATEST: return max([x._strict_date(lean) for x in self.objects]) return min([x._strict_date(lean) for x in self.objects]) class Level2Interval(Level1Interval): - def __init__(self, lower, upper): + def __init__(self, lower, upper): # noqa # Check whether incoming lower/upper values are single-item lists, and # if so take just the first item. This works around what I *think* is a # bug in the grammar that provides us with single-item lists of @@ -1122,7 +1118,7 @@ class Level2Season(Season): class ExponentialYear(LongYear): - def __init__(self, base, exponent, significant_digits=None): + def __init__(self, base, exponent, significant_digits=None): # noqa self.base = base self.exponent = exponent self.significant_digits = ( @@ -1132,13 +1128,13 @@ def __init__(self, base, exponent, significant_digits=None): def _precise_year(self): return int(self.base) * 10 ** int(self.exponent) - def get_year(self): + def get_year(self) -> str: if self.significant_digits: return f"{self.base}E{self.exponent}S{self.significant_digits}" else: return f"{self.base}E{self.exponent}" - year = property(get_year) + year = property(get_year) # noqa def estimated(self): return self._precise_year() From add79bd311c2af7698043deff7f992535cb22aed Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Thu, 12 Sep 2024 15:45:39 +0200 Subject: [PATCH 18/40] Fixed: Remove SHORT_YEAR_RE This wasn't actually used anywhere! Also removed a redundant regex group --- edtf/natlang/en.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/edtf/natlang/en.py b/edtf/natlang/en.py index d57bb82..9cee578 100644 --- a/edtf/natlang/en.py +++ b/edtf/natlang/en.py @@ -15,7 +15,6 @@ DEFAULT_DATE_1 = datetime(1234, 1, 1, 0, 0) DEFAULT_DATE_2 = datetime(5678, 10, 10, 0, 0) -SHORT_YEAR_RE = re.compile(r"(-?)([\du])([\dxu])([\dxu])([\dxu])") LONG_YEAR_RE = re.compile(r"y(-?)([1-9]\d\d\d\d+)") CENTURY_RE = re.compile(r"(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?") CENTURY_RANGE = re.compile(r"\b(\d\d)(th|st|nd|rd|)-(\d\d)(th|st|nd|rd) [cC]") @@ -27,7 +26,7 @@ BEFORE_CHECK = re.compile(r"\b(?:before|earlier|avant)\b") AFTER_CHECK = re.compile(r"\b(after|since|later|aprés|apres)\b") APPROX_CHECK = re.compile( - r"\b(?:ca?\.? ?\d{4}|circa|approx|approximately|around|about|~\d{3,4})|(?:^~)" + r"\b(?:ca?\.? ?\d{4}|circa|approx|approximately|around|about|~\d{3,4})|^~" ) UNCERTAIN_CHECK = re.compile(r"\b(?:uncertain|possibly|maybe|guess|\d{3,4}\?)") UNCERTAIN_REPL = re.compile(r"(\d{4})\?") From fee0b648e2344169aeee2b35068c670afc7325a7 Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Thu, 12 Sep 2024 15:50:51 +0200 Subject: [PATCH 19/40] Problem with f-string --- edtf/parser/parser_classes.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/edtf/parser/parser_classes.py b/edtf/parser/parser_classes.py index eb9fac5..0334738 100644 --- a/edtf/parser/parser_classes.py +++ b/edtf/parser/parser_classes.py @@ -1049,7 +1049,8 @@ def parse_action(cls, toks): return cls(*args) def __str__(self): - return f"[{", ".join([str(o) for o in self.objects])}]" + repr: str = ", ".join([str(o) for o in self.objects]) + return f"[{repr}]" def _strict_date(self, lean: str = EARLIEST): strict_dates = [x._strict_date(lean) for x in self.objects] From 89f36924adf59d271aadc3df6ac3ea1454ccb093 Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Thu, 12 Sep 2024 15:59:23 +0200 Subject: [PATCH 20/40] Another f-string fix --- edtf/parser/parser_classes.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/edtf/parser/parser_classes.py b/edtf/parser/parser_classes.py index 0334738..14728f0 100644 --- a/edtf/parser/parser_classes.py +++ b/edtf/parser/parser_classes.py @@ -1083,7 +1083,8 @@ def parse_action(cls, toks): return cls(*args) def __str__(self): - return f"{{{", ".join([str(o) for o in self.objects])}}}" + repr: str = ", ".join([str(o) for o in self.objects]) + return f"{{{repr}}}" def _strict_date(self, lean: str = EARLIEST): if lean == LATEST: From 9da1d94436e124a337fd81133cee5ac48b85cea5 Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Thu, 12 Sep 2024 16:29:46 +0200 Subject: [PATCH 21/40] Fixed: pyproject errors --- pyproject.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 2d050c2..8826b99 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,8 @@ [project] name = "edtf" version = "5.0.0" +license = { file = "LICENSE" } +keywords = ['edtf'] dependencies = [ "python-dateutil", "pyparsing", From 95b83aab6218d5f4b1ef445f7c10970789b683c8 Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Thu, 16 Jan 2025 17:17:39 +0100 Subject: [PATCH 22/40] Testing without lru_cache --- edtf/natlang/en.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/edtf/natlang/en.py b/edtf/natlang/en.py index 9cee578..6e77e5b 100644 --- a/edtf/natlang/en.py +++ b/edtf/natlang/en.py @@ -1,6 +1,5 @@ """Utilities to derive an EDTF string from an (English) natural language string.""" -import functools import re from datetime import datetime from typing import Optional @@ -52,7 +51,7 @@ REJECT_RULES = re.compile(r".*dynasty.*") # Don't parse '23rd Dynasty' to 'uuuu-uu-23' -@functools.lru_cache +# @functools.lru_cache def text_to_edtf(text: str) -> Optional[str]: """ Generate EDTF string equivalent of a given natural language date string. @@ -133,7 +132,7 @@ def text_to_edtf(text: str) -> Optional[str]: return result -@functools.lru_cache +# @functools.lru_cache def text_to_edtf_date(text: str) -> Optional[str]: """ Return EDTF string equivalent of a given natural language date string. From 6262a38aab5ef18f9f109455c2534087fef943b6 Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Thu, 16 Jan 2025 17:28:17 +0100 Subject: [PATCH 23/40] Fixed: New ruff rules --- edtf/natlang/en.py | 4 ++-- edtf/parser/grammar.py | 18 +++++++++--------- edtf/parser/parser_classes.py | 6 +----- 3 files changed, 12 insertions(+), 16 deletions(-) diff --git a/edtf/natlang/en.py b/edtf/natlang/en.py index 6e77e5b..62ccca5 100644 --- a/edtf/natlang/en.py +++ b/edtf/natlang/en.py @@ -171,7 +171,7 @@ def text_to_edtf_date(text: str) -> Optional[str]: # detect CE/BCE year form is_ce = re.findall(CE_RE, t) if is_century: - result = "%02dXX" % (int(is_century[0][0]) - 1,) + result = f"{int(is_century[0][0]) - 1:02d}XX" is_approximate = is_approximate or re.findall(APPROX_CENTURY_RE, t) is_uncertain = is_uncertain or re.findall(UNCERTAIN_CENTURY_RE, t) @@ -182,7 +182,7 @@ def text_to_edtf_date(text: str) -> Optional[str]: pass elif is_ce: - result = "%04d" % (int(is_ce[0][0])) + result = f"{int(is_ce[0][0]):04d}" is_approximate = is_approximate or re.findall(APPROX_CE_RE, t) is_uncertain = is_uncertain or re.findall(UNCERTAIN_CE_RE, t) diff --git a/edtf/parser/grammar.py b/edtf/parser/grammar.py index beabf52..b11a3c8 100644 --- a/edtf/parser/grammar.py +++ b/edtf/parser/grammar.py @@ -49,15 +49,15 @@ Unspecified, ) -oneThru12 = oneOf(["%.2d" % i for i in range(1, 13)]) -oneThru13 = oneOf(["%.2d" % i for i in range(1, 14)]) -oneThru23 = oneOf(["%.2d" % i for i in range(1, 24)]) -zeroThru23 = oneOf(["%.2d" % i for i in range(0, 24)]) -oneThru29 = oneOf(["%.2d" % i for i in range(1, 30)]) -oneThru30 = oneOf(["%.2d" % i for i in range(1, 31)]) -oneThru31 = oneOf(["%.2d" % i for i in range(1, 32)]) -oneThru59 = oneOf(["%.2d" % i for i in range(1, 60)]) -zeroThru59 = oneOf(["%.2d" % i for i in range(0, 60)]) +oneThru12 = oneOf([f"{i:.2d}" for i in range(1, 13)]) +oneThru13 = oneOf([f"{i:.2d}" for i in range(1, 14)]) +oneThru23 = oneOf([f"{i:.2d}" for i in range(1, 24)]) +zeroThru23 = oneOf([f"{i:.2d}" for i in range(0, 24)]) +oneThru29 = oneOf([f"{i:.2d}" for i in range(1, 30)]) +oneThru30 = oneOf([f"{i:.2d}" for i in range(1, 31)]) +oneThru31 = oneOf([f"{i:.2d}" for i in range(1, 32)]) +oneThru59 = oneOf([f"{i:.2d}" for i in range(1, 60)]) +zeroThru59 = oneOf([f"{i:.2d}" for i in range(0, 60)]) digit = Word(nums, exact=1) positiveDigit = Word(nums, exact=1, excludeChars="0") diff --git a/edtf/parser/parser_classes.py b/edtf/parser/parser_classes.py index 14728f0..9439a80 100644 --- a/edtf/parser/parser_classes.py +++ b/edtf/parser/parser_classes.py @@ -304,11 +304,7 @@ def __str__(self): return r def isoformat(self, default=date.max): - return "%s-%02d-%02d" % ( - self._year, - int(self._month or default.month), - int(self._day or default.day), - ) + return f"{self._year}-{int(self._month or default.month):02d}-{int(self._day or default.day):02d}" def lower_fuzzy(self): if not hasattr(self, "significant_digits") or not self.significant_digits: From 8fbce49ce8105cab5ae52f8e0d8fa4f94b042a49 Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Thu, 16 Jan 2025 17:34:42 +0100 Subject: [PATCH 24/40] Fixed formatting --- edtf/natlang/tests.py | 6 ++-- edtf/parser/tests.py | 66 +++++++++++++++++++++---------------------- 2 files changed, 36 insertions(+), 36 deletions(-) diff --git a/edtf/natlang/tests.py b/edtf/natlang/tests.py index d2c43a5..e0acaad 100644 --- a/edtf/natlang/tests.py +++ b/edtf/natlang/tests.py @@ -182,9 +182,9 @@ def test_natlang(input_text, expected_output): Verify that the conversion from text to EDTF format matches the expected output. """ result = text_to_edtf(input_text) - assert ( - result == expected_output - ), f"Failed for input: {input_text} - expected {expected_output}, got {result}" + assert result == expected_output, ( + f"Failed for input: {input_text} - expected {expected_output}, got {result}" + ) @pytest.mark.benchmark diff --git a/edtf/parser/tests.py b/edtf/parser/tests.py index c2dd711..f37c806 100644 --- a/edtf/parser/tests.py +++ b/edtf/parser/tests.py @@ -312,51 +312,51 @@ def test_edtf_examples(test_input, expected_tuple): # Unpack expected results based on their count if len(expected_tuple) == 1: - assert ( - result_date == expected_tuple[0] - ), f"Expected {expected_tuple[0]}, got {result_date}" + assert result_date == expected_tuple[0], ( + f"Expected {expected_tuple[0]}, got {result_date}" + ) elif len(expected_tuple) == 2: lower_strict = iso_to_struct_time(expected_tuple[0]) upper_strict = iso_to_struct_time(expected_tuple[1]) - assert ( - result.lower_strict() == lower_strict - ), f"Lower strict date does not match. Expected {lower_strict}, got {result.lower_strict()}" - assert ( - result.upper_strict() == upper_strict - ), f"Upper strict date does not match. Expected {upper_strict}, got {result.upper_strict()}" + assert result.lower_strict() == lower_strict, ( + f"Lower strict date does not match. Expected {lower_strict}, got {result.lower_strict()}" + ) + assert result.upper_strict() == upper_strict, ( + f"Upper strict date does not match. Expected {upper_strict}, got {result.upper_strict()}" + ) elif len(expected_tuple) == 3: strict_date = iso_to_struct_time(expected_tuple[0]) lower_fuzzy = iso_to_struct_time(expected_tuple[1]) upper_fuzzy = iso_to_struct_time(expected_tuple[2]) - assert ( - result.lower_strict() == strict_date - ), f"Lower strict date does not match. Expected {strict_date}, got {result.lower_strict()}" - assert ( - result.upper_strict() == strict_date - ), f"Upper strict date does not match. Expected {strict_date}, got {result.upper_strict()}" - assert ( - result.lower_fuzzy() == lower_fuzzy - ), f"Lower fuzzy date does not match. Expected {lower_fuzzy}, got {result.lower_fuzzy()}" - assert ( - result.upper_fuzzy() == upper_fuzzy - ), f"Upper fuzzy date does not match. Expected {upper_fuzzy}, got {result.upper_fuzzy()}" + assert result.lower_strict() == strict_date, ( + f"Lower strict date does not match. Expected {strict_date}, got {result.lower_strict()}" + ) + assert result.upper_strict() == strict_date, ( + f"Upper strict date does not match. Expected {strict_date}, got {result.upper_strict()}" + ) + assert result.lower_fuzzy() == lower_fuzzy, ( + f"Lower fuzzy date does not match. Expected {lower_fuzzy}, got {result.lower_fuzzy()}" + ) + assert result.upper_fuzzy() == upper_fuzzy, ( + f"Upper fuzzy date does not match. Expected {upper_fuzzy}, got {result.upper_fuzzy()}" + ) elif len(expected_tuple) == 4: lower_strict = iso_to_struct_time(expected_tuple[0]) upper_strict = iso_to_struct_time(expected_tuple[1]) lower_fuzzy = iso_to_struct_time(expected_tuple[2]) upper_fuzzy = iso_to_struct_time(expected_tuple[3]) - assert ( - result.lower_strict() == lower_strict - ), f"Lower strict date does not match. Expected {lower_strict}, got {result.lower_strict()}" - assert ( - result.upper_strict() == upper_strict - ), f"Upper strict date does not match. Expected {upper_strict}, got {result.upper_strict()}" - assert ( - result.lower_fuzzy() == lower_fuzzy - ), f"Lower fuzzy date does not match. Expected {lower_fuzzy}, got {result.lower_fuzzy()}" - assert ( - result.upper_fuzzy() == upper_fuzzy - ), f"Upper fuzzy date does not match. Expected {upper_fuzzy}, got {result.upper_fuzzy()}" + assert result.lower_strict() == lower_strict, ( + f"Lower strict date does not match. Expected {lower_strict}, got {result.lower_strict()}" + ) + assert result.upper_strict() == upper_strict, ( + f"Upper strict date does not match. Expected {upper_strict}, got {result.upper_strict()}" + ) + assert result.lower_fuzzy() == lower_fuzzy, ( + f"Lower fuzzy date does not match. Expected {lower_fuzzy}, got {result.lower_fuzzy()}" + ) + assert result.upper_fuzzy() == upper_fuzzy, ( + f"Upper fuzzy date does not match. Expected {upper_fuzzy}, got {result.upper_fuzzy()}" + ) @pytest.mark.parametrize("bad_input", BAD_EXAMPLES) From adc1805886df4ad57f4ef0b7c1a441948c889c08 Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Thu, 16 Jan 2025 17:53:27 +0100 Subject: [PATCH 25/40] Bad formatting conversion --- edtf/parser/grammar.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/edtf/parser/grammar.py b/edtf/parser/grammar.py index b11a3c8..2fdb4bf 100644 --- a/edtf/parser/grammar.py +++ b/edtf/parser/grammar.py @@ -49,15 +49,15 @@ Unspecified, ) -oneThru12 = oneOf([f"{i:.2d}" for i in range(1, 13)]) -oneThru13 = oneOf([f"{i:.2d}" for i in range(1, 14)]) -oneThru23 = oneOf([f"{i:.2d}" for i in range(1, 24)]) -zeroThru23 = oneOf([f"{i:.2d}" for i in range(0, 24)]) -oneThru29 = oneOf([f"{i:.2d}" for i in range(1, 30)]) -oneThru30 = oneOf([f"{i:.2d}" for i in range(1, 31)]) -oneThru31 = oneOf([f"{i:.2d}" for i in range(1, 32)]) -oneThru59 = oneOf([f"{i:.2d}" for i in range(1, 60)]) -zeroThru59 = oneOf([f"{i:.2d}" for i in range(0, 60)]) +oneThru12 = oneOf([f"{i:02}" for i in range(1, 13)]) +oneThru13 = oneOf([f"{i:02}" for i in range(1, 14)]) +oneThru23 = oneOf([f"{i:02}" for i in range(1, 24)]) +zeroThru23 = oneOf([f"{i:02}" for i in range(0, 24)]) +oneThru29 = oneOf([f"{i:02}" for i in range(1, 30)]) +oneThru30 = oneOf([f"{i:02}" for i in range(1, 31)]) +oneThru31 = oneOf([f"{i:02}" for i in range(1, 32)]) +oneThru59 = oneOf([f"{i:02}" for i in range(1, 60)]) +zeroThru59 = oneOf([f"{i:02}" for i in range(0, 60)]) digit = Word(nums, exact=1) positiveDigit = Word(nums, exact=1, excludeChars="0") From ffbe2d4e31635565155443b4f902f70eeb153dda Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Fri, 17 Jan 2025 11:06:43 +0100 Subject: [PATCH 26/40] Replace range len with enumerate --- edtf/natlang/en.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/edtf/natlang/en.py b/edtf/natlang/en.py index 62ccca5..82fefc8 100644 --- a/edtf/natlang/en.py +++ b/edtf/natlang/en.py @@ -230,7 +230,7 @@ def text_to_edtf_date(text: str) -> Optional[str]: mentions_month = re.findall(MENTIONS_MONTH, t) mentions_day = re.findall(MENTIONS_DAY, t) - for i in range(len(date1)): + for i, char in enumerate(date1): # if the given year could be a century (e.g. '1800s') then use # approximate/uncertain markers to decide whether we treat it as # a century or a decade. @@ -241,10 +241,10 @@ def text_to_edtf_date(text: str) -> Optional[str]: result += "X" # year precision else: result += "X" # decade precision - elif date1[i] == date2[i]: + elif char == date2[i]: # since both attempts at parsing produced the same result # it must be parsed value, not a default - result += date1[i] + result += char else: # different values were produced, meaning that it's likely # a default. Use 'unspecified' From f7aeddb59f38d4169c8e31a7432baccb03fcaad4 Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Tue, 21 Jan 2025 10:02:55 +0100 Subject: [PATCH 27/40] reinstate lru cache --- edtf/natlang/en.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/edtf/natlang/en.py b/edtf/natlang/en.py index 82fefc8..f287f42 100644 --- a/edtf/natlang/en.py +++ b/edtf/natlang/en.py @@ -1,5 +1,6 @@ """Utilities to derive an EDTF string from an (English) natural language string.""" +import functools import re from datetime import datetime from typing import Optional @@ -51,7 +52,7 @@ REJECT_RULES = re.compile(r".*dynasty.*") # Don't parse '23rd Dynasty' to 'uuuu-uu-23' -# @functools.lru_cache +@functools.lru_cache def text_to_edtf(text: str) -> Optional[str]: """ Generate EDTF string equivalent of a given natural language date string. @@ -132,7 +133,7 @@ def text_to_edtf(text: str) -> Optional[str]: return result -# @functools.lru_cache +@functools.lru_cache def text_to_edtf_date(text: str) -> Optional[str]: """ Return EDTF string equivalent of a given natural language date string. From 4885de55fc157e3a9118900e74b5733fef36e185 Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Mon, 26 May 2025 13:41:33 +0200 Subject: [PATCH 28/40] Updates to typing etc. --- README.md | 2 +- edtf/convert.py | 2 +- edtf/jdutil.py | 4 +- edtf/natlang/en.py | 11 +- edtf/parser/grammar.py | 5 +- edtf/parser/parser_classes.py | 276 +++++++++++++++++----------------- edtf/py.typed | 0 pyproject.toml | 4 +- 8 files changed, 150 insertions(+), 154 deletions(-) create mode 100644 edtf/py.typed diff --git a/README.md b/README.md index 2c1f34c..c28d450 100644 --- a/README.md +++ b/README.md @@ -516,6 +516,6 @@ Since the `EDTFField` and the `_earliest` and `_latest` field values are set aut * Fix formatting: `ruff format --config pyproject.toml` * Linting and formatting checks and attempted fixes are also run as precommit hooks if you installed them. -### Coverage and benchmraks +### Coverage and benchmarks Coverage reports are generated and added as comments to commits, and also visible in the actions log. Benchmarks are run on pull requests and are published [here]( https://ixc.github.io/python-edtf/dev/bench/) and also visible in the actions log. diff --git a/edtf/convert.py b/edtf/convert.py index db86155..c03e2ea 100644 --- a/edtf/convert.py +++ b/edtf/convert.py @@ -21,7 +21,7 @@ def old_specs_to_new_specs_expression(expression): return expression -def dt_to_struct_time(dt): +def dt_to_struct_time(dt) -> struct_time: """ Convert a `datetime.date` or `datetime.datetime` to a `struct_time` representation *with zero values* for data fields that we cannot always diff --git a/edtf/jdutil.py b/edtf/jdutil.py index 7c0a3bd..b7a2cbb 100644 --- a/edtf/jdutil.py +++ b/edtf/jdutil.py @@ -396,7 +396,7 @@ def __sub__(self, other): return jd_to_datetime(combined) - elif isinstance(other, (datetime, dt.datetime)): + elif isinstance(other, datetime | dt.datetime): diff = datetime_to_jd(self) - datetime_to_jd(other) return dt.timedelta(diff) @@ -407,7 +407,7 @@ def __sub__(self, other): raise TypeError(s) def __rsub__(self, other): - if not isinstance(other, (datetime, dt.datetime)): + if not isinstance(other, datetime | dt.datetime): s = "jdutil.datetime supports '-' with: " s += "jdutil.datetime and datetime.datetime" raise TypeError(s) diff --git a/edtf/natlang/en.py b/edtf/natlang/en.py index f287f42..077ae19 100644 --- a/edtf/natlang/en.py +++ b/edtf/natlang/en.py @@ -3,7 +3,6 @@ import functools import re from datetime import datetime -from typing import Optional from dateutil.parser import ParserError, parse @@ -53,7 +52,7 @@ @functools.lru_cache -def text_to_edtf(text: str) -> Optional[str]: +def text_to_edtf(text: str) -> str | None: """ Generate EDTF string equivalent of a given natural language date string. """ @@ -63,7 +62,7 @@ def text_to_edtf(text: str) -> Optional[str]: t = text.lower() # try parsing the whole thing - result: Optional[str] = text_to_edtf_date(t) + result: str | None = text_to_edtf_date(t) if not result: # split by list delims and move fwd with the first thing that returns a non-empty string. @@ -134,7 +133,7 @@ def text_to_edtf(text: str) -> Optional[str]: @functools.lru_cache -def text_to_edtf_date(text: str) -> Optional[str]: +def text_to_edtf_date(text: str) -> str | None: """ Return EDTF string equivalent of a given natural language date string. @@ -215,9 +214,9 @@ def text_to_edtf_date(text: str) -> Optional[str]: ) except ParserError: - return + return None except Exception: - return + return None if dt1.date() == DEFAULT_DATE_1.date() and dt2.date() == DEFAULT_DATE_2.date(): # couldn't parse anything - defaults are untouched. diff --git a/edtf/parser/grammar.py b/edtf/parser/grammar.py index 2fdb4bf..db6e93e 100644 --- a/edtf/parser/grammar.py +++ b/edtf/parser/grammar.py @@ -343,15 +343,16 @@ def f(toks): ) -def parse_edtf(input_string, parseAll=True, fail_silently=False, debug=None): +def parse_edtf(input_string, parse_all=True, fail_silently=False, debug=None): if debug is None: debug = DEBUG_PYPARSING if not input_string: raise EDTFParseException(input_string) try: - p = edtfParser.parseString(input_string.strip(), parseAll) + p = edtfParser.parseString(input_string.strip(), parse_all) if p: return p[0] + return None except ParseException as err: if fail_silently: return None diff --git a/edtf/parser/parser_classes.py b/edtf/parser/parser_classes.py index 9439a80..f4168b9 100644 --- a/edtf/parser/parser_classes.py +++ b/edtf/parser/parser_classes.py @@ -1,6 +1,7 @@ import calendar import math import re +from collections.abc import Callable from datetime import date, datetime from operator import add, sub from time import struct_time @@ -50,7 +51,7 @@ def days_in_month(year: int, month: int) -> int: }[month] -def apply_delta(op, time_struct, delta): +def apply_delta(op: Callable, time_struct: struct_time, delta) -> struct_time: """ Apply a `relativedelta` to a `struct_time` data structure. @@ -76,9 +77,9 @@ def apply_delta(op, time_struct, delta): # Adjust the year to be close to the 2000 millenium in 1,000 year # increments to try and retain accurate relative leap years - actual_year = time_struct.tm_year - millenium = int(float(actual_year) / 1000) - millenium_diff = (2 - millenium) * 1000 + actual_year: int = time_struct.tm_year + millenium: int = int(float(actual_year) / 1000) + millenium_diff: int = (2 - millenium) * 1000 adjusted_year = actual_year + millenium_diff # Apply delta to the date/time with adjusted year dt = datetime(*(adjusted_year,) + time_struct[1:6]) @@ -97,6 +98,9 @@ class EDTFObject: """ parser = None + _is_approximate: bool + _is_uncertain: bool + _uncertain_and_approximate: bool @classmethod def set_parser(cls, p): @@ -116,26 +120,26 @@ def parse_action(cls, toks): def parse(cls, s): return cls.parser.parseString(s)[0] - def __repr__(self): + def __repr__(self) -> str: return f"{type(self).__name__}: '{str(self)}'" def __init__(self, *args, **kwargs): - str = f"{type(self).__name__}.__init__(*{args}, **{kwargs})" - raise NotImplementedError(f"{str} is not implemented.") + message: str = f"{type(self).__name__}.__init__(*{args}, **{kwargs})" + raise NotImplementedError(f"{message} is not implemented.") - def __str__(self): + def __str__(self) -> str: raise NotImplementedError def _strict_date(self, lean: str = EARLIEST): raise NotImplementedError - def lower_strict(self): + def lower_strict(self) -> struct_time: return self._strict_date(lean=EARLIEST) - def upper_strict(self): + def upper_strict(self) -> struct_time: return self._strict_date(lean=LATEST) - def _get_fuzzy_padding(self, lean: str): + def _get_fuzzy_padding(self, lean: str) -> relativedelta: """ Subclasses should override this to pad based on how precise they are. """ @@ -168,15 +172,15 @@ def set_is_uncertain_and_approximate(self, val: bool) -> None: set_is_uncertain_and_approximate, # noqa ) - def lower_fuzzy(self): + def lower_fuzzy(self) -> struct_time: strict_val = self.lower_strict() return apply_delta(sub, strict_val, self._get_fuzzy_padding(EARLIEST)) - def upper_fuzzy(self): + def upper_fuzzy(self) -> struct_time: strict_val = self.upper_strict() return apply_delta(add, strict_val, self._get_fuzzy_padding(LATEST)) - def __eq__(self, other): + def __eq__(self, other) -> bool: if isinstance(other, EDTFObject): return str(self) == str(other) elif isinstance(other, date): @@ -185,7 +189,7 @@ def __eq__(self, other): return self._strict_date() == trim_struct_time(other) return False - def __ne__(self, other): + def __ne__(self, other) -> bool: if isinstance(other, EDTFObject): return str(self) != str(other) elif isinstance(other, date): @@ -194,7 +198,7 @@ def __ne__(self, other): return self._strict_date() != trim_struct_time(other) return True - def __gt__(self, other): + def __gt__(self, other) -> bool: if isinstance(other, EDTFObject): return self.lower_strict() > other.lower_strict() elif isinstance(other, date): @@ -205,7 +209,7 @@ def __gt__(self, other): f"can't compare {type(self).__name__} with {type(other).__name__}" ) - def __ge__(self, other): + def __ge__(self, other) -> bool: if isinstance(other, EDTFObject): return self.lower_strict() >= other.lower_strict() elif isinstance(other, date): @@ -216,7 +220,7 @@ def __ge__(self, other): f"can't compare {type(self).__name__} with {type(other).__name__}" ) - def __lt__(self, other): + def __lt__(self, other) -> bool: if isinstance(other, EDTFObject): return self.lower_strict() < other.lower_strict() elif isinstance(other, date): @@ -227,7 +231,7 @@ def __lt__(self, other): f"can't compare {type(self).__name__} with {type(other).__name__}" ) - def __le__(self, other): + def __le__(self, other) -> bool: if isinstance(other, EDTFObject): return self.lower_strict() <= other.lower_strict() elif isinstance(other, date): @@ -245,9 +249,9 @@ def __le__(self, other): class Date(EDTFObject): def __init__( # noqa self, - year: Optional[str] = None, - month: Optional[str] = None, - day: Optional[str] = None, + year: str | None = None, + month: str | None = None, + day: str | None = None, significant_digits=None, **kwargs, ): @@ -256,10 +260,12 @@ def __init__( # noqa self.__init__(**kwargs[param]) return - self._year = year # Year is required, but sometimes passed in as a 'date' dict. - self._month = month - self._day = day - self.significant_digits = ( + self._year: str | None = ( + year # Year is required, but sometimes passed in as a 'date' dict. + ) + self._month: str | None = month + self._day: str | None = day + self.significant_digits: int | None = ( int(significant_digits) if significant_digits else None ) @@ -268,32 +274,32 @@ def set_year(self, y: str): raise AttributeError("Year must not be None") self._year = y - def get_year(self) -> str: + def get_year(self) -> str | None: return self._year year = property(get_year, set_year) # noqa - def set_month(self, m: Optional[str]): + def set_month(self, m: str | None): self._month = m if m is None: self._day = None - def get_month(self) -> Optional[str]: + def get_month(self) -> str | None: return self._month month = property(get_month, set_month) # noqa - def set_day(self, d: Optional[str]): + def set_day(self, d: str | None): self._day = d if d is None: self._day = None - def get_day(self) -> Optional[str]: + def get_day(self) -> str | None: return self._day day = property(get_day, set_day) # noqa - def __str__(self): + def __str__(self) -> str: r = self._year if self._month: r += f"-{self._month}" @@ -303,47 +309,42 @@ def __str__(self): r += f"S{self.significant_digits}" return r - def isoformat(self, default=date.max): + def isoformat(self, default=date.max) -> str: return f"{self._year}-{int(self._month or default.month):02d}-{int(self._day or default.day):02d}" - def lower_fuzzy(self): + def lower_fuzzy(self) -> struct_time: if not hasattr(self, "significant_digits") or not self.significant_digits: return apply_delta( sub, self.lower_strict(), self._get_fuzzy_padding(EARLIEST) ) - else: - total_digits = len(self._year) - insignificant_digits = total_digits - self.significant_digits - lower_year = ( - int(self._year) - // (10**insignificant_digits) - * (10**insignificant_digits) - ) - return struct_time([lower_year, 1, 1] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS) - def upper_fuzzy(self): + total_digits = len(self._year) + insignificant_digits = total_digits - self.significant_digits + lower_year = ( + int(self._year) // (10**insignificant_digits) * (10**insignificant_digits) + ) + return struct_time([lower_year, 1, 1] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS) + + def upper_fuzzy(self) -> struct_time: if not hasattr(self, "significant_digits") or not self.significant_digits: return apply_delta( add, self.upper_strict(), self._get_fuzzy_padding(LATEST) ) - else: - total_digits = len(self._year) - insignificant_digits = total_digits - self.significant_digits - upper_year = (int(self._year) // (10**insignificant_digits) + 1) * ( - 10**insignificant_digits - ) - 1 - return struct_time( - [upper_year, 12, 31] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS - ) - def _precise_year(self, lean): + total_digits: int = len(self._year) + insignificant_digits: int = total_digits - self.significant_digits + upper_year: int = (int(self._year) // (10**insignificant_digits) + 1) * ( + 10**insignificant_digits + ) - 1 + return struct_time([upper_year, 12, 31] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS) + + def _precise_year(self, lean: str) -> int: # Replace any ambiguous characters in the year string with 0s or 9s if lean == EARLIEST: return int(re.sub(r"X", r"0", self._year)) - else: - return int(re.sub(r"X", r"9", self._year)) + return int(re.sub(r"X", r"9", self._year)) - def _precise_month(self, lean): + def _precise_month(self, lean: str) -> int: if self._month and self._month != "XX": try: return int(self._month) @@ -351,10 +352,9 @@ def _precise_month(self, lean): raise ValueError( f"Couldn't convert {self._month} to int (in {self})" ) from err - else: - return 1 if lean == EARLIEST else 12 + return 1 if lean == EARLIEST else 12 - def _precise_day(self, lean): + def _precise_day(self, lean: str) -> int: if not self._day or self._day == "XX": if lean == EARLIEST: return 1 @@ -362,10 +362,9 @@ def _precise_day(self, lean): return days_in_month( self._precise_year(LATEST), self._precise_month(LATEST) ) - else: - return int(self._day) + return int(self._day) - def _strict_date(self, lean: str = EARLIEST): + def _strict_date(self, lean: str = EARLIEST) -> struct_time: """ Return a `time.struct_time` representation of the date. """ @@ -380,39 +379,39 @@ def _strict_date(self, lean: str = EARLIEST): ) @property - def precision(self): + def precision(self) -> str: if self._day: return PRECISION_DAY if self._month: return PRECISION_MONTH return PRECISION_YEAR - def estimated(self): + def estimated(self) -> int: return self._precise_year(EARLIEST) class DateAndTime(EDTFObject): - def __init__(self, date, time): # noqa: super raises not implemented - self.date = date + def __init__(self, date: Date, time): # noqa: super raises not implemented + self.date: Date = date self.time = time - def __str__(self): + def __str__(self) -> str: return self.isoformat() - def isoformat(self): + def isoformat(self) -> str: return self.date.isoformat() + "T" + self.time - def _strict_date(self, lean: str = EARLIEST): + def _strict_date(self, lean: str = EARLIEST) -> struct_time: return self.date._strict_date(lean) - def __eq__(self, other): + def __eq__(self, other) -> bool: if isinstance(other, datetime): return self.isoformat() == other.isoformat() elif isinstance(other, struct_time): return self._strict_date() == trim_struct_time(other) return super().__eq__(other) - def __ne__(self, other): + def __ne__(self, other) -> bool: if isinstance(other, datetime): return self.isoformat() != other.isoformat() elif isinstance(other, struct_time): @@ -428,15 +427,13 @@ def __init__(self, lower, upper): # noqa: super() raises not implemented def __str__(self): return f"{self.lower}/{self.upper}" - def _strict_date(self, lean: str = EARLIEST): + def _strict_date(self, lean: str = EARLIEST) -> struct_time: if lean == EARLIEST: - r = self.lower._strict_date(lean) - else: - r = self.upper._strict_date(lean) - return r + return self.lower._strict_date(lean) + return self.upper._strict_date(lean) @property - def precision(self): + def precision(self) -> int | None: if self.lower.precision == self.upper.precision: return self.lower.precision return None @@ -456,27 +453,28 @@ def __init__(self, *args): # noqa: super() raises not implemented raise AssertionError("UA must have exactly one argument") ua = args[0] - self.is_uncertain = "?" in ua - self.is_approximate = "~" in ua - self.is_uncertain_and_approximate = "%" in ua + self.is_uncertain: bool = "?" in ua + self.is_approximate: bool = "~" in ua + self.is_uncertain_and_approximate: bool = "%" in ua - def __str__(self): - d = "" + def __str__(self) -> str: + d: list = [] if self.is_uncertain: - d += "?" + d.append("?") if self.is_approximate: - d += "~" + d.append("~") if self.is_uncertain_and_approximate: - d += "%" - return d + d.append("%") + return "".join(d) - def _get_multiplier(self): + def _get_multiplier(self) -> float | None: if self.is_uncertain_and_approximate: return appsettings.MULTIPLIER_IF_BOTH elif self.is_uncertain: return appsettings.MULTIPLIER_IF_UNCERTAIN elif self.is_approximate: return appsettings.MULTIPLIER_IF_APPROXIMATE + return None class UncertainOrApproximate(EDTFObject): @@ -489,13 +487,12 @@ def __init__(self, date, ua): # noqa: super() raises not implemented ua.is_uncertain_and_approximate if ua else False ) - def __str__(self): + def __str__(self) -> str: if self.ua: return f"{self.date}{self.ua}" - else: - return str(self.date) + return str(self.date) - def _strict_date(self, lean: str = EARLIEST): + def _strict_date(self, lean: str = EARLIEST) -> tuple: return self.date._strict_date(lean) def _get_fuzzy_padding(self, lean): @@ -536,10 +533,9 @@ def __init__(self, sectionOpen=False, other_section_element=None): # noqa: supe def __str__(self): if self.is_unknown: return "" - else: - return ".." + return ".." - def _strict_date(self, lean: str = EARLIEST): + def _strict_date(self, lean: str = EARLIEST) -> float | None: if lean not in (EARLIEST, LATEST): raise ValueError("lean must be one of EARLIEST or LATEST") @@ -555,6 +551,7 @@ def _strict_date(self, lean: str = EARLIEST): return apply_delta(add, lower, appsettings.DELTA_IF_UNKNOWN) else: return math.inf + return None @property def precision(self): @@ -661,10 +658,10 @@ def lower_strict(self): ) else: return strict_val - else: - return self._strict_date(lean=EARLIEST) - def upper_strict(self): + return self._strict_date(lean=EARLIEST) + + def upper_strict(self) -> struct_time: if self.negative: strict_val = self._strict_date(lean=EARLIEST) if self.precision in ( @@ -689,8 +686,7 @@ def upper_strict(self): ) else: return strict_val - else: - return self._strict_date(lean=LATEST) + return self._strict_date(lean=LATEST) @property def precision(self): @@ -744,15 +740,16 @@ def __init__(self, lower: Optional[dict] = None, upper: Optional[dict] = None): or self.upper.is_uncertain_and_approximate ) - def _get_fuzzy_padding(self, lean): + def _get_fuzzy_padding(self, lean) -> relativedelta | None: if lean == EARLIEST: return self.lower._get_fuzzy_padding(lean) elif lean == LATEST: return self.upper._get_fuzzy_padding(lean) + return None class LongYear(EDTFObject): - def __init__(self, year: str, significant_digits: Optional[str] = None): # noqa + def __init__(self, year: str, significant_digits: str | None = None): # noqa self.year = year self.significant_digits = ( int(significant_digits) if significant_digits else None @@ -761,18 +758,16 @@ def __init__(self, year: str, significant_digits: Optional[str] = None): # noqa def __str__(self): if self.significant_digits: return f"Y{self.year}S{self.significant_digits}" - else: - return f"Y{self.year}" + return f"Y{self.year}" def _precise_year(self): return int(self.year) - def _strict_date(self, lean: str = EARLIEST): + def _strict_date(self, lean: str = EARLIEST) -> struct_time: py = self._precise_year() if lean == EARLIEST: return struct_time([py, 1, 1] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS) - else: - return struct_time([py, 12, 31] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS) + return struct_time([py, 12, 31] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS) def estimated(self): return self._precise_year() @@ -782,18 +777,19 @@ def lower_fuzzy(self): strict_val = self.lower_strict() if not self.significant_digits: return apply_delta(sub, strict_val, self._get_fuzzy_padding(EARLIEST)) - else: - insignificant_digits = len(str(full_year)) - int(self.significant_digits) - if insignificant_digits <= 0: - return apply_delta(sub, strict_val, self._get_fuzzy_padding(EARLIEST)) - padding_value = 10**insignificant_digits - sig_digits = full_year // padding_value - lower_year = sig_digits * padding_value - return apply_delta( - sub, - struct_time([lower_year, 1, 1] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS), - self._get_fuzzy_padding(EARLIEST), - ) + + insignificant_digits = len(str(full_year)) - int(self.significant_digits) + if insignificant_digits <= 0: + return apply_delta(sub, strict_val, self._get_fuzzy_padding(EARLIEST)) + + padding_value = 10**insignificant_digits + sig_digits = full_year // padding_value + lower_year = sig_digits * padding_value + return apply_delta( + sub, + struct_time([lower_year, 1, 1] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS), + self._get_fuzzy_padding(EARLIEST), + ) def upper_fuzzy(self): full_year = self._precise_year() @@ -822,7 +818,7 @@ def __init__(self, year, season, **kwargs): # noqa # `Date` methods do their thing. self.day = None - def __str__(self): + def __str__(self) -> str: return f"{self.year}-{self.season}" def _precise_month(self, lean): @@ -842,15 +838,15 @@ def __init__( # noqa year=None, month=None, day=None, - year_ua=False, - month_ua=False, - day_ua=False, - year_month_ua=False, - month_day_ua=False, + year_ua: UA | None = None, + month_ua: UA | None = None, + day_ua: UA | None = None, + year_month_ua: UA | None = None, + month_day_ua: UA | None = None, ssn=None, - season_ua=False, - all_ua=False, - year_ua_b=False, + season_ua: UA | None = None, + all_ua: UA | None = None, + year_ua_b: UA | None = None, ): self.year = year self.month = month @@ -890,7 +886,7 @@ def __init__( # noqa if hasattr(item, "is_uncertain_and_approximate") ) - def __str__(self): + def __str__(self) -> str: if self.season_ua: return f"{self.season}{self.season_ua}" @@ -937,12 +933,12 @@ def _precise_month(self, lean: str): return self.season._precise_month(lean) return super()._precise_month(lean) - def _precise_day(self, lean): + def _precise_day(self, lean: str): if self.season: return self.season._precise_day(lean) return super()._precise_day(lean) - def _get_fuzzy_padding(self, lean): + def _get_fuzzy_padding(self, lean: str): """ This is not a perfect interpretation as fuzziness is introduced for redundant uncertainly modifiers e.g. (2006~)~ will get two sets of @@ -1022,7 +1018,7 @@ def __init__(self, lower=None, upper=None): # noqa self.upper = upper def __str__(self): - return "{}..{}".format(self.lower or "", self.upper or "") + return f"{self.lower or ''}..{self.upper or ''}" class EarlierConsecutives(Level1Interval): @@ -1048,7 +1044,7 @@ def __str__(self): repr: str = ", ".join([str(o) for o in self.objects]) return f"[{repr}]" - def _strict_date(self, lean: str = EARLIEST): + def _strict_date(self, lean: str = EARLIEST) -> float: strict_dates = [x._strict_date(lean) for x in self.objects] # Accounting for possible 'inf' and '-inf' values if lean == LATEST: @@ -1082,7 +1078,7 @@ def __str__(self): repr: str = ", ".join([str(o) for o in self.objects]) return f"{{{repr}}}" - def _strict_date(self, lean: str = EARLIEST): + def _strict_date(self, lean: str = EARLIEST) -> float: if lean == LATEST: return max([x._strict_date(lean) for x in self.objects]) return min([x._strict_date(lean) for x in self.objects]) @@ -1094,15 +1090,16 @@ def __init__(self, lower, upper): # noqa # if so take just the first item. This works around what I *think* is a # bug in the grammar that provides us with single-item lists of # `PartialUncertainOrApproximate` items for lower/upper values. - if isinstance(lower, (tuple, list)) and len(lower) == 1: + if isinstance(lower, tuple | list) and len(lower) == 1: self.lower = lower[0] else: self.lower = lower - if isinstance(lower, (tuple, list)) and len(upper) == 1: + if isinstance(lower, tuple | list) and len(upper) == 1: self.upper = upper[0] else: self.upper = upper + self.is_approximate = self.lower.is_approximate or self.upper.is_approximate self.is_uncertain = self.lower.is_uncertain or self.upper.is_uncertain self.is_uncertain_and_approximate = ( @@ -1123,16 +1120,15 @@ def __init__(self, base, exponent, significant_digits=None): # noqa int(significant_digits) if significant_digits else None ) - def _precise_year(self): + def _precise_year(self) -> int: return int(self.base) * 10 ** int(self.exponent) def get_year(self) -> str: if self.significant_digits: return f"{self.base}E{self.exponent}S{self.significant_digits}" - else: - return f"{self.base}E{self.exponent}" + return f"{self.base}E{self.exponent}" year = property(get_year) # noqa - def estimated(self): + def estimated(self) -> int: return self._precise_year() diff --git a/edtf/py.typed b/edtf/py.typed new file mode 100644 index 0000000..e69de29 diff --git a/pyproject.toml b/pyproject.toml index 8826b99..2f098bb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ dependencies = [ "pyparsing", ] description = "Python implementation of Library of Congress EDTF (Extended Date Time Format) specification" -requires-python = ">=3.8" +requires-python = ">=3.11" readme = {file = "README.md", content-type = "text/markdown"} authors = [ { name = "The Interaction Consortium", email = "studio@interaction.net.au"}, @@ -111,7 +111,7 @@ exclude_lines = [ [tool.ruff] # Python 3.8 -target-version = "py38" +target-version = "py311" extend-exclude = [ '**/migrations/*', From 98bfe3651f61b572355064b2b9af6a25140bc6c7 Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Mon, 26 May 2025 13:49:58 +0200 Subject: [PATCH 29/40] Update GH actions --- .github/workflows/ci.yml | 2 +- .github/workflows/coverage_readme.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4645d13..8f05398 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -18,7 +18,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + python-version: ["3.11", "3.12", "3.13"] defaults: run: working-directory: . diff --git a/.github/workflows/coverage_readme.yml b/.github/workflows/coverage_readme.yml index 86309de..edb5ac4 100644 --- a/.github/workflows/coverage_readme.yml +++ b/.github/workflows/coverage_readme.yml @@ -22,10 +22,10 @@ jobs: persist-credentials: false fetch-depth: 0 - - name: Set up Python 3.12 + - name: Set up Python 3.13 uses: actions/setup-python@v5 with: - python-version: 3.12 + python-version: 3.13 cache: 'pip' cache-dependency-path: '**/pyproject.toml' From af98f87278a277ad15a3e3fd865924828404924a Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Mon, 26 May 2025 14:24:00 +0200 Subject: [PATCH 30/40] New: Add a validator helper function --- edtf/parser/grammar.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/edtf/parser/grammar.py b/edtf/parser/grammar.py index db6e93e..4f5a526 100644 --- a/edtf/parser/grammar.py +++ b/edtf/parser/grammar.py @@ -359,3 +359,8 @@ def parse_edtf(input_string, parse_all=True, fail_silently=False, debug=None): if debug: raise raise EDTFParseException(input_string, err) from None + + +def validate_edtf(input_string: str) -> bool: + """Returns True if the input string was successfully parsed; False if it isn't.""" + return parse_edtf(input_string, fail_silently=True) is not None From f97b627703cc24b256cae7e8645fa82fdcb53ed1 Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Mon, 26 May 2025 14:24:57 +0200 Subject: [PATCH 31/40] Add validator to init --- edtf/__init__.py | 2 ++ edtf/parser/__init__.py | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/edtf/__init__.py b/edtf/__init__.py index 7bb2885..2265bc1 100644 --- a/edtf/__init__.py +++ b/edtf/__init__.py @@ -23,6 +23,7 @@ Unspecified, UnspecifiedIntervalSection, parse_edtf, + validate_edtf, ) from .convert import ( @@ -46,6 +47,7 @@ "trim_struct_time", "text_to_edtf", "parse_edtf", + "validate_edtf", # parser_exceptions "EDTFParseException", # parser_classes diff --git a/edtf/parser/__init__.py b/edtf/parser/__init__.py index 43197d5..2d2b3b7 100644 --- a/edtf/parser/__init__.py +++ b/edtf/parser/__init__.py @@ -1,5 +1,5 @@ from .edtf_exceptions import EDTFParseException -from .grammar import parse_edtf +from .grammar import parse_edtf, validate_edtf from .parser_classes import ( UA, Consecutives, @@ -26,6 +26,7 @@ __all__ = [ "parse_edtf", + "validate_edtf", "EDTFParseException", "EDTFObject", "Date", From ae82b1191f5e5178627a82b3c26ccded915a3d03 Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Mon, 26 May 2025 15:41:15 +0200 Subject: [PATCH 32/40] Rename validator --- edtf/__init__.py | 4 ++-- edtf/parser/__init__.py | 4 ++-- edtf/parser/grammar.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/edtf/__init__.py b/edtf/__init__.py index 2265bc1..0b0bfbf 100644 --- a/edtf/__init__.py +++ b/edtf/__init__.py @@ -22,8 +22,8 @@ UncertainOrApproximate, Unspecified, UnspecifiedIntervalSection, + is_valid_edtf, parse_edtf, - validate_edtf, ) from .convert import ( @@ -47,7 +47,7 @@ "trim_struct_time", "text_to_edtf", "parse_edtf", - "validate_edtf", + "is_valid_edtf", # parser_exceptions "EDTFParseException", # parser_classes diff --git a/edtf/parser/__init__.py b/edtf/parser/__init__.py index 2d2b3b7..9cbf3c3 100644 --- a/edtf/parser/__init__.py +++ b/edtf/parser/__init__.py @@ -1,5 +1,5 @@ from .edtf_exceptions import EDTFParseException -from .grammar import parse_edtf, validate_edtf +from .grammar import is_valid_edtf, parse_edtf from .parser_classes import ( UA, Consecutives, @@ -26,7 +26,7 @@ __all__ = [ "parse_edtf", - "validate_edtf", + "is_valid_edtf", "EDTFParseException", "EDTFObject", "Date", diff --git a/edtf/parser/grammar.py b/edtf/parser/grammar.py index 4f5a526..7ff3820 100644 --- a/edtf/parser/grammar.py +++ b/edtf/parser/grammar.py @@ -361,6 +361,6 @@ def parse_edtf(input_string, parse_all=True, fail_silently=False, debug=None): raise EDTFParseException(input_string, err) from None -def validate_edtf(input_string: str) -> bool: +def is_valid_edtf(input_string: str) -> bool: """Returns True if the input string was successfully parsed; False if it isn't.""" return parse_edtf(input_string, fail_silently=True) is not None From 86b154619e94559457e921109539a823913bd52b Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Tue, 27 May 2025 10:58:58 +0200 Subject: [PATCH 33/40] Annotate appsettings --- edtf/appsettings.py | 47 +++++++++++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/edtf/appsettings.py b/edtf/appsettings.py index 8e15846..e9b4d9d 100644 --- a/edtf/appsettings.py +++ b/edtf/appsettings.py @@ -12,7 +12,7 @@ except ImportError: EDTF = {} -SEASON_MONTHS_RANGE = EDTF.get( +SEASON_MONTHS_RANGE: dict[int, list[int]] = EDTF.get( "SEASON_MONTHS_RANGE", { # season id: [earliest_month, last_month] @@ -27,7 +27,7 @@ }, ) -SEASON_L2_MONTHS_RANGE = EDTF.get( +SEASON_L2_MONTHS_RANGE: dict[int, list[int]] = EDTF.get( "SEASON_L2_MONTHS_RANGE", { # season id: [earliest_month, last_month] @@ -67,9 +67,9 @@ }, ) -DAY_FIRST = EDTF.get("DAY_FIRST", False) # Americans! +DAY_FIRST: bool = EDTF.get("DAY_FIRST", False) # Americans! -SEASONS = EDTF.get( +SEASONS: dict[int, str] = EDTF.get( "SEASONS", { 21: "spring", @@ -78,25 +78,38 @@ 24: "winter", }, ) -INVERSE_SEASONS = EDTF.get("INVERSE_SEASONS", {v: k for k, v in SEASONS.items()}) +INVERSE_SEASONS: dict[str, int] = EDTF.get( + "INVERSE_SEASONS", {v: k for k, v in SEASONS.items()} +) # also need to interpret `fall` INVERSE_SEASONS["fall"] = 23 # changing these will break tests -PADDING_DAY_PRECISION = EDTF.get("PADDING_DAY_PRECISION", relativedelta(days=1)) -PADDING_MONTH_PRECISION = EDTF.get("PADDING_MONTH_PRECISION", relativedelta(months=1)) -PADDING_YEAR_PRECISION = EDTF.get("PADDING_YEAR_PRECISION", relativedelta(years=1)) -PADDING_SEASON_PRECISION = EDTF.get("PADDING_SEASON_PRECISION", relativedelta(weeks=12)) -PADDING_DECADE_PRECISION = EDTF.get("PADDING_DECADE_PRECISION", relativedelta(years=10)) -PADDING_CENTURY_PRECISION = EDTF.get( +PADDING_DAY_PRECISION: relativedelta = EDTF.get( + "PADDING_DAY_PRECISION", relativedelta(days=1) +) +PADDING_MONTH_PRECISION: relativedelta = EDTF.get( + "PADDING_MONTH_PRECISION", relativedelta(months=1) +) +PADDING_YEAR_PRECISION: relativedelta = EDTF.get( + "PADDING_YEAR_PRECISION", relativedelta(years=1) +) +PADDING_SEASON_PRECISION: relativedelta = EDTF.get( + "PADDING_SEASON_PRECISION", relativedelta(weeks=12) +) +PADDING_DECADE_PRECISION: relativedelta = EDTF.get( + "PADDING_DECADE_PRECISION", relativedelta(years=10) +) +PADDING_CENTURY_PRECISION: relativedelta = EDTF.get( "PADDING_CENTURY_PRECISION", relativedelta(years=100) ) -PADDING_MILLENNIUM_PRECISION = EDTF.get( +PADDING_MILLENNIUM_PRECISION: relativedelta = EDTF.get( "PADDING_MILLENNIUM_PRECISION", relativedelta(years=1000) ) -MULTIPLIER_IF_UNCERTAIN = EDTF.get("MULTIPLIER_IF_UNCERTAIN", 1.0) -MULTIPLIER_IF_APPROXIMATE = EDTF.get("MULTIPLIER_IF_APPROXIMATE", 1.0) -MULTIPLIER_IF_BOTH = EDTF.get("MULTIPLIER_IF_BOTH", 2.0) -DELTA_IF_UNKNOWN = EDTF.get("DELTA_IF_UNKNOWN", relativedelta(years=10)) +MULTIPLIER_IF_UNCERTAIN: float = EDTF.get("MULTIPLIER_IF_UNCERTAIN", 1.0) +MULTIPLIER_IF_APPROXIMATE: float = EDTF.get("MULTIPLIER_IF_APPROXIMATE", 1.0) +MULTIPLIER_IF_BOTH: float = EDTF.get("MULTIPLIER_IF_BOTH", 2.0) +DELTA_IF_UNKNOWN: relativedelta = EDTF.get("DELTA_IF_UNKNOWN", relativedelta(years=10)) +DELTA_IF_EMPTY: relativedelta = relativedelta(None) -DEBUG_PYPARSING = False +DEBUG_PYPARSING: bool = False From a771ec2ff14e8d5e2d40ed8fee0627f8e0aa58df Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Tue, 27 May 2025 10:59:20 +0200 Subject: [PATCH 34/40] parseString is an alias to parse_string --- edtf/parser/grammar.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/edtf/parser/grammar.py b/edtf/parser/grammar.py index 7ff3820..de84633 100644 --- a/edtf/parser/grammar.py +++ b/edtf/parser/grammar.py @@ -343,13 +343,20 @@ def f(toks): ) -def parse_edtf(input_string, parse_all=True, fail_silently=False, debug=None): +def parse_edtf( + input_string: str, + parse_all: bool = True, + fail_silently: bool = False, + debug: bool | None = None, +): if debug is None: debug = DEBUG_PYPARSING + if not input_string: raise EDTFParseException(input_string) + try: - p = edtfParser.parseString(input_string.strip(), parse_all) + p = edtfParser.parse_string(input_string.strip(), parse_all) if p: return p[0] return None From 24a5f607095acf81e131fcbaba71cff5fd60043b Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Tue, 27 May 2025 11:00:12 +0200 Subject: [PATCH 35/40] More fixes for correctness Also removed the one regex and replaced it with the "replace" method on strings. --- edtf/parser/parser_classes.py | 53 ++++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/edtf/parser/parser_classes.py b/edtf/parser/parser_classes.py index f4168b9..cfb4459 100644 --- a/edtf/parser/parser_classes.py +++ b/edtf/parser/parser_classes.py @@ -1,6 +1,5 @@ import calendar import math -import re from collections.abc import Callable from datetime import date, datetime from operator import add, sub @@ -123,7 +122,7 @@ def parse(cls, s): def __repr__(self) -> str: return f"{type(self).__name__}: '{str(self)}'" - def __init__(self, *args, **kwargs): + def __init__(self, *args, **kwargs) -> None: message: str = f"{type(self).__name__}.__init__(*{args}, **{kwargs})" raise NotImplementedError(f"{message} is not implemented.") @@ -143,7 +142,7 @@ def _get_fuzzy_padding(self, lean: str) -> relativedelta: """ Subclasses should override this to pad based on how precise they are. """ - return relativedelta(0) + return relativedelta(None) def get_is_approximate(self) -> bool: return getattr(self, "_is_approximate", False) @@ -269,7 +268,7 @@ def __init__( # noqa int(significant_digits) if significant_digits else None ) - def set_year(self, y: str): + def set_year(self, y: str | None): if y is None: raise AttributeError("Year must not be None") self._year = y @@ -300,10 +299,10 @@ def get_day(self) -> str | None: day = property(get_day, set_day) # noqa def __str__(self) -> str: - r = self._year - if self._month: + r = f"{self._year}" + if self._month is not None: r += f"-{self._month}" - if self._day: + if self._day is not None: r += f"-{self._day}" if self.significant_digits: r += f"S{self.significant_digits}" @@ -318,10 +317,11 @@ def lower_fuzzy(self) -> struct_time: sub, self.lower_strict(), self._get_fuzzy_padding(EARLIEST) ) - total_digits = len(self._year) - insignificant_digits = total_digits - self.significant_digits - lower_year = ( - int(self._year) // (10**insignificant_digits) * (10**insignificant_digits) + total_digits: int = len(self._year) if self._year else 0 + i_year: int = int(self._year) if self._year else 0 + insignificant_digits: int = total_digits - self.significant_digits + lower_year: int = ( + i_year // (10**insignificant_digits) * (10**insignificant_digits) ) return struct_time([lower_year, 1, 1] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS) @@ -331,18 +331,25 @@ def upper_fuzzy(self) -> struct_time: add, self.upper_strict(), self._get_fuzzy_padding(LATEST) ) - total_digits: int = len(self._year) + total_digits: int = len(self._year) if self._year else 0 + i_year: int = int(self._year) if self._year else 0 insignificant_digits: int = total_digits - self.significant_digits - upper_year: int = (int(self._year) // (10**insignificant_digits) + 1) * ( + upper_year: int = (i_year // (10**insignificant_digits) + 1) * ( 10**insignificant_digits ) - 1 return struct_time([upper_year, 12, 31] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS) def _precise_year(self, lean: str) -> int: # Replace any ambiguous characters in the year string with 0s or 9s + if not self._year: + return 0 + if lean == EARLIEST: - return int(re.sub(r"X", r"0", self._year)) - return int(re.sub(r"X", r"9", self._year)) + rep = self._year.replace("X", "0") + else: + rep = self._year.replace("X", "9") + + return int(rep) def _precise_month(self, lean: str) -> int: if self._month and self._month != "XX": @@ -448,7 +455,7 @@ def parse_action(cls, toks): args = toks.asList() return cls(*args) - def __init__(self, *args): # noqa: super() raises not implemented + def __init__(self, *args) -> None: # noqa: super() raises not implemented if len(args) != 1: raise AssertionError("UA must have exactly one argument") ua = args[0] @@ -944,7 +951,7 @@ def _get_fuzzy_padding(self, lean: str): redundant uncertainly modifiers e.g. (2006~)~ will get two sets of fuzziness. """ - result = relativedelta(0) + result = relativedelta(None) if self.year_ua: result += ( @@ -1040,9 +1047,9 @@ def parse_action(cls, toks): args = [t for t in toks.asList() if isinstance(t, EDTFObject)] return cls(*args) - def __str__(self): - repr: str = ", ".join([str(o) for o in self.objects]) - return f"[{repr}]" + def __str__(self) -> str: + out: str = ", ".join([str(o) for o in self.objects]) + return f"[{out}]" def _strict_date(self, lean: str = EARLIEST) -> float: strict_dates = [x._strict_date(lean) for x in self.objects] @@ -1074,9 +1081,9 @@ def parse_action(cls, toks): args = [t for t in toks.asList() if isinstance(t, EDTFObject)] return cls(*args) - def __str__(self): - repr: str = ", ".join([str(o) for o in self.objects]) - return f"{{{repr}}}" + def __str__(self) -> str: + out: str = ", ".join([str(o) for o in self.objects]) + return f"{{{out}}}" def _strict_date(self, lean: str = EARLIEST) -> float: if lean == LATEST: From df15fd8d4c4b976f103144e9a91ffe69d19609f9 Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Mon, 2 Jun 2025 11:57:22 +0200 Subject: [PATCH 36/40] Try 3.10 --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8f05398..dad48aa 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -18,7 +18,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.11", "3.12", "3.13"] + python-version: ["3.10", "3.11", "3.12", "3.13"] defaults: run: working-directory: . From 6a91fa04aecd5ff5e10833edf77c712179bc628f Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Mon, 2 Jun 2025 11:57:34 +0200 Subject: [PATCH 37/40] More type annotations --- edtf/parser/parser_classes.py | 36 +++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/edtf/parser/parser_classes.py b/edtf/parser/parser_classes.py index cfb4459..0047aec 100644 --- a/edtf/parser/parser_classes.py +++ b/edtf/parser/parser_classes.py @@ -757,17 +757,17 @@ def _get_fuzzy_padding(self, lean) -> relativedelta | None: class LongYear(EDTFObject): def __init__(self, year: str, significant_digits: str | None = None): # noqa - self.year = year - self.significant_digits = ( + self.year: str = year + self.significant_digits: int | None = ( int(significant_digits) if significant_digits else None ) - def __str__(self): + def __str__(self) -> str: if self.significant_digits: return f"Y{self.year}S{self.significant_digits}" return f"Y{self.year}" - def _precise_year(self): + def _precise_year(self) -> int: return int(self.year) def _strict_date(self, lean: str = EARLIEST) -> struct_time: @@ -776,10 +776,10 @@ def _strict_date(self, lean: str = EARLIEST) -> struct_time: return struct_time([py, 1, 1] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS) return struct_time([py, 12, 31] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS) - def estimated(self): + def estimated(self) -> int: return self._precise_year() - def lower_fuzzy(self): + def lower_fuzzy(self) -> struct_time: full_year = self._precise_year() strict_val = self.lower_strict() if not self.significant_digits: @@ -798,7 +798,7 @@ def lower_fuzzy(self): self._get_fuzzy_padding(EARLIEST), ) - def upper_fuzzy(self): + def upper_fuzzy(self) -> struct_time: full_year = self._precise_year() strict_val = self.upper_strict() if not self.significant_digits: @@ -828,7 +828,7 @@ def __init__(self, year, season, **kwargs): # noqa def __str__(self) -> str: return f"{self.year}-{self.season}" - def _precise_month(self, lean): + def _precise_month(self, lean: str) -> int: rng = appsettings.SEASON_L2_MONTHS_RANGE[int(self.season)] if lean == EARLIEST: return rng[0] @@ -881,13 +881,13 @@ def __init__( # noqa season_ua, all_ua, ] - self.is_uncertain = any( + self.is_uncertain: bool = any( item.is_uncertain for item in uas if hasattr(item, "is_uncertain") ) - self.is_approximate = any( + self.is_approximate: bool = any( item.is_approximate for item in uas if hasattr(item, "is_approximate") ) - self.is_uncertain_and_approximate = any( + self.is_uncertain_and_approximate: bool = any( item.is_uncertain_and_approximate for item in uas if hasattr(item, "is_uncertain_and_approximate") @@ -930,22 +930,22 @@ def set_year(self, y): # Year can be None. year = property(Date.get_year, set_year) # noqa - def _precise_year(self, lean: str): + def _precise_year(self, lean: str) -> int: if self.season: return self.season._precise_year(lean) return super()._precise_year(lean) - def _precise_month(self, lean: str): + def _precise_month(self, lean: str) -> int: if self.season: return self.season._precise_month(lean) return super()._precise_month(lean) - def _precise_day(self, lean: str): + def _precise_day(self, lean: str) -> int: if self.season: return self.season._precise_day(lean) return super()._precise_day(lean) - def _get_fuzzy_padding(self, lean: str): + def _get_fuzzy_padding(self, lean: str) -> struct_time: """ This is not a perfect interpretation as fuzziness is introduced for redundant uncertainly modifiers e.g. (2006~)~ will get two sets of @@ -1024,17 +1024,17 @@ def __init__(self, lower=None, upper=None): # noqa else: self.upper = upper - def __str__(self): + def __str__(self) -> str: return f"{self.lower or ''}..{self.upper or ''}" class EarlierConsecutives(Level1Interval): - def __str__(self): + def __str__(self) -> str: return f"{self.lower}{self.upper}" class LaterConsecutives(Level1Interval): - def __str__(self): + def __str__(self) -> str: return f"{self.lower}{self.upper}" From d3d0cd59fc2d85e17d8c9c64a8392f68e733e344 Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Mon, 2 Jun 2025 11:59:06 +0200 Subject: [PATCH 38/40] Update supported python in pyproject --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 2f098bb..1980551 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ dependencies = [ "pyparsing", ] description = "Python implementation of Library of Congress EDTF (Extended Date Time Format) specification" -requires-python = ">=3.11" +requires-python = ">=3.10" readme = {file = "README.md", content-type = "text/markdown"} authors = [ { name = "The Interaction Consortium", email = "studio@interaction.net.au"}, From 9bd142d4e9ebde4cd771e5ae1b9351abc148b705 Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Mon, 2 Jun 2025 12:09:12 +0200 Subject: [PATCH 39/40] Fixed: UA is a single state, no need for append --- edtf/parser/parser_classes.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/edtf/parser/parser_classes.py b/edtf/parser/parser_classes.py index 0047aec..aa8144a 100644 --- a/edtf/parser/parser_classes.py +++ b/edtf/parser/parser_classes.py @@ -465,14 +465,13 @@ def __init__(self, *args) -> None: # noqa: super() raises not implemented self.is_uncertain_and_approximate: bool = "%" in ua def __str__(self) -> str: - d: list = [] if self.is_uncertain: - d.append("?") - if self.is_approximate: - d.append("~") - if self.is_uncertain_and_approximate: - d.append("%") - return "".join(d) + return "?" + elif self.is_approximate: + return "~" + elif self.is_uncertain_and_approximate: + return "%" + return "" def _get_multiplier(self) -> float | None: if self.is_uncertain_and_approximate: From 517ba18d3c1b41d97a6ba2b5221d84447205e75b Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Mon, 2 Jun 2025 12:14:02 +0200 Subject: [PATCH 40/40] Add mypy and pip to test dependencies --- pyproject.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 1980551..30a1a9a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,6 +49,8 @@ test = [ "coverage", "pytest-cov", "junitparser", + "mypy>=1.15.0", + "pip>=25.1.1", ] [project.urls]