From 2296d7fa0ec55cf35e28d177929ec2830f29320c Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Wed, 6 May 2026 09:26:46 -0700 Subject: [PATCH] Fix: Rewrite Whoosh year only queries to be to Tantivy date syntax (#12725) --- src/documents/search/_query.py | 25 ++++++++ src/documents/tests/search/test_query.py | 79 +++++++++++++++++++++++- 2 files changed, 103 insertions(+), 1 deletion(-) diff --git a/src/documents/search/_query.py b/src/documents/search/_query.py index 0fbe4603b..65df260c3 100644 --- a/src/documents/search/_query.py +++ b/src/documents/search/_query.py @@ -71,6 +71,10 @@ _WHOOSH_REL_RANGE_RE = regex.compile( ) # Whoosh-style 8-digit date: field:YYYYMMDD — field-aware so timezone can be applied correctly _DATE8_RE = regex.compile(r"(?P\w+):(?P\d{8})\b") +_YEAR_RANGE_RE = regex.compile( + r"(?P<field>\w+):\[(?P<y1>\d{4})\s+TO\s+(?P<y2>\d{4})\]", + regex.IGNORECASE, +) _SIMPLE_QUERY_TOKEN_RE = regex.compile(r"\S+") @@ -336,6 +340,26 @@ def _rewrite_8digit_date(query: str, tz: tzinfo) -> str: ) +def _rewrite_year_range(query: str) -> str: + """Rewrite Whoosh-style year-only date ranges to ISO 8601 UTC boundaries. + + Converts ``field:[YYYY TO YYYY]`` to a full ISO 8601 datetime range. + The upper bound is the start of the year after the end year (exclusive), + matching the Whoosh convention of treating year-only ranges as full-year spans. + """ + + def _sub(m: regex.Match[str]) -> str: + field = m.group("field") + lo = datetime(int(m.group("y1")), 1, 1, tzinfo=UTC) + hi = datetime(int(m.group("y2")) + 1, 1, 1, tzinfo=UTC) + return f"{field}:[{_fmt(lo)} TO {_fmt(hi)}]" + + try: + return _YEAR_RANGE_RE.sub(_sub, query, timeout=_REGEX_TIMEOUT) + except TimeoutError: # pragma: no cover + raise ValueError("Query too complex to process (year range rewrite timed out)") + + def rewrite_natural_date_keywords(query: str, tz: tzinfo) -> str: """ Rewrite natural date syntax to ISO 8601 format for Tantivy compatibility. 
@@ -359,6 +383,7 @@ def rewrite_natural_date_keywords(query: str, tz: tzinfo) -> str: """ query = _rewrite_compact_date(query) query = _rewrite_whoosh_relative_range(query) + query = _rewrite_year_range(query) query = _rewrite_8digit_date(query, tz) query = _rewrite_relative_range(query) diff --git a/src/documents/tests/search/test_query.py b/src/documents/tests/search/test_query.py index e47d6b7df..11297eb92 100644 --- a/src/documents/tests/search/test_query.py +++ b/src/documents/tests/search/test_query.py @@ -444,6 +444,83 @@ class TestParseUserQuery: assert isinstance(q, tantivy.Query) +class TestYearRangeRewriting: + """Whoosh-style year-only date ranges must be rewritten to ISO 8601.""" + + @pytest.mark.parametrize( + ("query", "field", "expected_lo", "expected_hi"), + [ + pytest.param( + "created:[2020 TO 2020]", + "created", + "2020-01-01T00:00:00Z", + "2021-01-01T00:00:00Z", + id="single_year_created", + ), + pytest.param( + "created:[2018 TO 2021]", + "created", + "2018-01-01T00:00:00Z", + "2022-01-01T00:00:00Z", + id="multi_year_range_created", + ), + pytest.param( + "added:[2022 TO 2023]", + "added", + "2022-01-01T00:00:00Z", + "2024-01-01T00:00:00Z", + id="added_field", + ), + pytest.param( + "modified:[2021 TO 2021]", + "modified", + "2021-01-01T00:00:00Z", + "2022-01-01T00:00:00Z", + id="modified_field", + ), + pytest.param( + "created:[2020 to 2020]", + "created", + "2020-01-01T00:00:00Z", + "2021-01-01T00:00:00Z", + id="lowercase_to_keyword", + ), + ], + ) + def test_year_range_rewritten( + self, + query: str, + field: str, + expected_lo: str, + expected_hi: str, + ) -> None: + result = rewrite_natural_date_keywords(query, UTC) + lo, hi = _range(result, field) + assert lo == expected_lo + assert hi == expected_hi + + def test_year_range_in_complex_boolean_query(self) -> None: + query = "tag:steuer AND (title:2020 OR (NOT title:2019 AND NOT title:2018 AND created:[2020 TO 2020]))" + result = rewrite_natural_date_keywords(query, UTC) + lo, hi = 
_range(result, "created") + assert lo == "2020-01-01T00:00:00Z" + assert hi == "2021-01-01T00:00:00Z" + assert "title:2020" in result + assert "title:2019" in result + assert "title:2018" in result + + def test_already_iso_date_range_passes_through_unchanged(self) -> None: + original = "created:[2020-01-01T00:00:00Z TO 2021-01-01T00:00:00Z]" + assert rewrite_natural_date_keywords(original, UTC) == original + + def test_8digit_in_brackets_not_matched_as_year_range(self) -> None: + # [YYYYMMDD TO YYYYMMDD] has 8-digit values - must not be caught by year rewriter + original = "created:[20200101 TO 20201231]" + result = rewrite_natural_date_keywords(original, UTC) + assert "20200101" in result or "2020-01-01" in result + assert "20201231" in result or "2020-12-31" in result + + class TestPassthrough: """Queries without field prefixes or unrelated content pass through unchanged.""" @@ -474,7 +551,7 @@ class TestNormalizeQuery: class TestPermissionFilter: """ - build_permission_filter tests use an in-memory index — no DB access needed. + build_permission_filter tests use an in-memory index - no DB access needed. Users are constructed as unsaved model instances (django_user_model(pk=N)) so no database round-trip occurs; only .pk is read by build_permission_filter.