mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-05-07 15:15:24 +00:00
Fix: Rewrite Whoosh year-only queries to Tantivy date syntax (#12725)
This commit is contained in:
@@ -71,6 +71,10 @@ _WHOOSH_REL_RANGE_RE = regex.compile(
|
||||
)
|
||||
# Whoosh-style 8-digit date: field:YYYYMMDD — field-aware so timezone can be applied correctly
_DATE8_RE = regex.compile(r"(?P<field>\w+):(?P<date8>\d{8})\b")
# Whoosh-style year-only range: field:[YYYY TO YYYY]. Both bounds must be
# exactly four digits, and IGNORECASE lets a lowercase "to" match as well.
# NOTE(review): the field group is a bare \w+, so this also matches ranges on
# fields that are not dates (any field with four-digit bounds) — confirm that
# is intended.
_YEAR_RANGE_RE = regex.compile(
    r"(?P<field>\w+):\[(?P<y1>\d{4})\s+TO\s+(?P<y2>\d{4})\]",
    regex.IGNORECASE,
)
# Matches one maximal run of non-whitespace characters.
_SIMPLE_QUERY_TOKEN_RE = regex.compile(r"\S+")
|
||||
|
||||
|
||||
@@ -336,6 +340,26 @@ def _rewrite_8digit_date(query: str, tz: tzinfo) -> str:
|
||||
)
|
||||
|
||||
|
||||
def _rewrite_year_range(query: str) -> str:
    """Rewrite Whoosh-style year-only date ranges to ISO 8601 UTC boundaries.

    Every ``field:[YYYY TO YYYY]`` occurrence matched by ``_YEAR_RANGE_RE`` is
    replaced with ``field:[<lo> TO <hi>]``. ``<lo>`` is 1 January 00:00 UTC of
    the first year and ``<hi>`` is 1 January 00:00 UTC of the year *after* the
    second year, so the rewritten range spans whole calendar years.

    A range whose years ``datetime`` cannot represent (year ``0000``, or an end
    year of ``9999`` whose successor would be 10000) is left exactly as typed
    rather than raising from inside the substitution callback.

    NOTE(review): the emitted range closes with ``]``. Confirm how the Tantivy
    query parser treats that bound; if it is inclusive, a value stamped exactly
    at ``<hi>`` would also match.

    Returns the query with all representable year-only ranges rewritten.
    Raises ``ValueError`` if the substitution exceeds ``_REGEX_TIMEOUT``.
    """

    def _sub(m: regex.Match[str]) -> str:
        field = m.group("field")
        try:
            lo = datetime(int(m.group("y1")), 1, 1, tzinfo=UTC)
            # +1: the upper bound is the first instant of the following year.
            hi = datetime(int(m.group("y2")) + 1, 1, 1, tzinfo=UTC)
        except ValueError:
            # datetime only supports years 1..9999; keep the user's text as-is
            # instead of surfacing an unrelated "year out of range" error.
            return m.group(0)
        return f"{field}:[{_fmt(lo)} TO {_fmt(hi)}]"

    try:
        return _YEAR_RANGE_RE.sub(_sub, query, timeout=_REGEX_TIMEOUT)
    except TimeoutError as exc:  # pragma: no cover
        raise ValueError(
            "Query too complex to process (year range rewrite timed out)",
        ) from exc
|
||||
|
||||
|
||||
def rewrite_natural_date_keywords(query: str, tz: tzinfo) -> str:
|
||||
"""
|
||||
Rewrite natural date syntax to ISO 8601 format for Tantivy compatibility.
|
||||
@@ -359,6 +383,7 @@ def rewrite_natural_date_keywords(query: str, tz: tzinfo) -> str:
|
||||
"""
|
||||
query = _rewrite_compact_date(query)
|
||||
query = _rewrite_whoosh_relative_range(query)
|
||||
query = _rewrite_year_range(query)
|
||||
query = _rewrite_8digit_date(query, tz)
|
||||
query = _rewrite_relative_range(query)
|
||||
|
||||
|
||||
@@ -444,6 +444,83 @@ class TestParseUserQuery:
|
||||
assert isinstance(q, tantivy.Query)
|
||||
|
||||
|
||||
class TestYearRangeRewriting:
    """Year-only Whoosh ranges (``field:[YYYY TO YYYY]``) become ISO 8601 ranges."""

    @pytest.mark.parametrize(
        ("query", "field", "expected_lo", "expected_hi"),
        [
            pytest.param(
                "created:[2020 TO 2020]",
                "created",
                "2020-01-01T00:00:00Z",
                "2021-01-01T00:00:00Z",
                id="single_year_created",
            ),
            pytest.param(
                "created:[2018 TO 2021]",
                "created",
                "2018-01-01T00:00:00Z",
                "2022-01-01T00:00:00Z",
                id="multi_year_range_created",
            ),
            pytest.param(
                "added:[2022 TO 2023]",
                "added",
                "2022-01-01T00:00:00Z",
                "2024-01-01T00:00:00Z",
                id="added_field",
            ),
            pytest.param(
                "modified:[2021 TO 2021]",
                "modified",
                "2021-01-01T00:00:00Z",
                "2022-01-01T00:00:00Z",
                id="modified_field",
            ),
            pytest.param(
                "created:[2020 to 2020]",
                "created",
                "2020-01-01T00:00:00Z",
                "2021-01-01T00:00:00Z",
                id="lowercase_to_keyword",
            ),
        ],
    )
    def test_year_range_rewritten(
        self,
        query: str,
        field: str,
        expected_lo: str,
        expected_hi: str,
    ) -> None:
        """Each year-only range expands to Jan 1 of the first year .. Jan 1 after the last."""
        rewritten = rewrite_natural_date_keywords(query, UTC)
        lower, upper = _range(rewritten, field)
        assert (lower, upper) == (expected_lo, expected_hi)

    def test_year_range_in_complex_boolean_query(self) -> None:
        """Only the bracketed range is rewritten; bare ``title:YYYY`` terms survive."""
        query = "tag:steuer AND (title:2020 OR (NOT title:2019 AND NOT title:2018 AND created:[2020 TO 2020]))"
        rewritten = rewrite_natural_date_keywords(query, UTC)
        lower, upper = _range(rewritten, "created")
        assert lower == "2020-01-01T00:00:00Z"
        assert upper == "2021-01-01T00:00:00Z"
        # The four-digit title terms are not ranges and must still be present.
        for untouched in ("title:2020", "title:2019", "title:2018"):
            assert untouched in rewritten

    def test_already_iso_date_range_passes_through_unchanged(self) -> None:
        """A range that is already in ISO 8601 form comes back byte-for-byte."""
        iso_query = "created:[2020-01-01T00:00:00Z TO 2021-01-01T00:00:00Z]"
        rewritten = rewrite_natural_date_keywords(iso_query, UTC)
        assert rewritten == iso_query

    def test_8digit_in_brackets_not_matched_as_year_range(self) -> None:
        """Eight-digit bounds are not year-only, so the year rewriter must skip them."""
        rewritten = rewrite_natural_date_keywords(
            "created:[20200101 TO 20201231]",
            UTC,
        )
        # Each bound must survive in either its compact or its dashed form.
        assert any(form in rewritten for form in ("20200101", "2020-01-01"))
        assert any(form in rewritten for form in ("20201231", "2020-12-31"))
|
||||
|
||||
|
||||
class TestPassthrough:
|
||||
"""Queries without field prefixes or unrelated content pass through unchanged."""
|
||||
|
||||
@@ -474,7 +551,7 @@ class TestNormalizeQuery:
|
||||
|
||||
class TestPermissionFilter:
|
||||
"""
|
||||
build_permission_filter tests use an in-memory index — no DB access needed.
|
||||
build_permission_filter tests use an in-memory index - no DB access needed.
|
||||
|
||||
Users are constructed as unsaved model instances (django_user_model(pk=N))
|
||||
so no database round-trip occurs; only .pk is read by build_permission_filter.
|
||||
|
||||
Reference in New Issue
Block a user