diff --git a/src/documents/search/_query.py b/src/documents/search/_query.py index 4fd2bfc68..dbd1f8ec2 100644 --- a/src/documents/search/_query.py +++ b/src/documents/search/_query.py @@ -52,7 +52,7 @@ _DATE_KEYWORD_PATTERN = "|".join( ) _FIELD_DATE_RE = regex.compile( - rf"""(?P\w+)\s*:\s*(?: + rf"""\b(?created|modified|added)\s*:\s*(?: (?P["'])(?P{_DATE_KEYWORD_PATTERN})(?P=quote) | (?P{_DATE_KEYWORD_PATTERN})(?![\w-]) @@ -69,10 +69,14 @@ _WHOOSH_REL_RANGE_RE = regex.compile( r"\[-(?P\d+)\s+(?Psecond|minute|hour|day|week|month|year)s?\s+to\s+now\]", regex.IGNORECASE, ) -# Whoosh-style 8-digit date: field:YYYYMMDD — field-aware so timezone can be applied correctly -_DATE8_RE = regex.compile(r"(?P\w+):(?P\d{8})\b") +# Whoosh-style 8-digit date: field:YYYYMMDD — field-aware so timezone can be applied correctly. +# Scoped to date fields only; numeric fields (asn, id, page_count, ...) must not be rewritten. +# A leading \b keeps longer names that merely end in a date field (e.g. date_added) untouched. +_DATE8_RE = regex.compile( + r"\b(?created|modified|added):(?P\d{8})\b", +) _YEAR_RANGE_RE = regex.compile( - r"(?P\w+):\[(?P\d{4})\s+TO\s+(?P\d{4})\]", + r"\b(?created|modified|added):\[(?P\d{4})\s+TO\s+(?P\d{4})\]", regex.IGNORECASE, ) _SIMPLE_QUERY_TOKEN_RE = regex.compile(r"\S+") diff --git a/src/documents/tests/search/test_query.py b/src/documents/tests/search/test_query.py index dfeffc253..9c52562d3 100644 --- a/src/documents/tests/search/test_query.py +++ b/src/documents/tests/search/test_query.py @@ -541,6 +541,51 @@ class TestYearRangeRewriting: assert "20201231" in result or "2020-12-31" in result +class TestNonDateFieldsNotRewritten: + """Date rewriters must only fire on the date fields (created/modified/added). + + Integer fields like asn/id/page_count and unknown fields would otherwise be + rewritten into date ranges and rejected by Tantivy as type mismatches. 
+ """ + + @pytest.mark.parametrize( + "query", + [ + pytest.param("asn:20240101", id="asn_8digit"), + pytest.param("id:20240101", id="id_8digit"), + pytest.param("page_count:12345678", id="page_count_8digit"), + pytest.param("num_notes:20231201", id="num_notes_8digit"), + ], + ) + def test_8digit_on_integer_field_passes_through_unchanged(self, query: str) -> None: + assert rewrite_natural_date_keywords(query, EASTERN) == query + + @pytest.mark.parametrize( + "query", + [ + pytest.param("asn:[2000 TO 2024]", id="asn_year_range"), + pytest.param("id:[2000 TO 2024]", id="id_year_range"), + pytest.param("page_count:[2000 TO 2024]", id="page_count_year_range"), + ], + ) + def test_year_range_on_integer_field_passes_through_unchanged( + self, + query: str, + ) -> None: + assert rewrite_natural_date_keywords(query, UTC) == query + + def test_unknown_field_keyword_passes_through_unchanged(self) -> None: + # foobar is not a date field: 'foobar:today' must not become a date range, + # which Tantivy would otherwise reject as an unknown/typed field. + assert rewrite_natural_date_keywords("foobar:today", UTC) == "foobar:today" + + def test_date_field_suffix_passes_through_unchanged(self) -> None: + # 'date_added' only ends with a date field name; the rewriters must not + # match the 'added:today' suffix inside the longer field name. + query = "date_added:today" + assert rewrite_natural_date_keywords(query, UTC) == query + + class TestPassthrough: """Queries without field prefixes or unrelated content pass through unchanged."""