From f4fa9165791b0bb0a2858b6978e4a9db565f8e72 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Mon, 15 Jun 2026 15:32:44 -0700 Subject: [PATCH] Fix (beta): restore v2 (Whoosh) advanced-search query compatibility (#13010) Co-authored-by: Claude Sonnet 4.6 --- src/documents/search/__init__.py | 4 + src/documents/search/_dates.py | 163 ++++ src/documents/search/_query.py | 432 +---------- src/documents/search/_translate.py | 566 ++++++++++++++ src/documents/tests/search/conftest.py | 12 + src/documents/tests/search/test_query.py | 98 ++- src/documents/tests/search/test_translate.py | 742 +++++++++++++++++++ src/documents/tests/test_api_search.py | 9 +- src/documents/views.py | 6 + 9 files changed, 1614 insertions(+), 418 deletions(-) create mode 100644 src/documents/search/_dates.py create mode 100644 src/documents/search/_translate.py create mode 100644 src/documents/tests/search/test_translate.py diff --git a/src/documents/search/__init__.py b/src/documents/search/__init__.py index 3f231b9c7..f0e65e5cd 100644 --- a/src/documents/search/__init__.py +++ b/src/documents/search/__init__.py @@ -8,11 +8,15 @@ from documents.search._backend import get_backend from documents.search._backend import reset_backend from documents.search._schema import needs_rebuild from documents.search._schema import wipe_index +from documents.search._translate import InvalidDateQuery +from documents.search._translate import SearchQueryError __all__ = [ + "InvalidDateQuery", "SearchHit", "SearchIndexLockError", "SearchMode", + "SearchQueryError", "TantivyBackend", "TantivyRelevanceList", "WriteBatch", diff --git a/src/documents/search/_dates.py b/src/documents/search/_dates.py new file mode 100644 index 000000000..a94935022 --- /dev/null +++ b/src/documents/search/_dates.py @@ -0,0 +1,163 @@ +from __future__ import annotations + +from datetime import UTC +from datetime import date +from datetime import datetime +from datetime import timedelta +from typing import TYPE_CHECKING +from typing import Final + +from dateutil.relativedelta import relativedelta + +if TYPE_CHECKING: + from datetime import tzinfo + +_DATE_ONLY_FIELDS = frozenset({"created"}) + +_TODAY: Final[str] = "today" +_YESTERDAY: Final[str] = "yesterday" +_PREVIOUS_WEEK: Final[str] = "previous week" +_THIS_MONTH: Final[str] = "this month" +_PREVIOUS_MONTH: Final[str] = "previous month" +_THIS_YEAR: Final[str] = "this year" +_PREVIOUS_YEAR: Final[str] = "previous year" +_PREVIOUS_QUARTER: Final[str] = "previous quarter" + +_DATE_KEYWORDS = frozenset( + { + _TODAY, + _YESTERDAY, + _PREVIOUS_WEEK, + _THIS_MONTH, + _PREVIOUS_MONTH, + _THIS_YEAR, + _PREVIOUS_YEAR, + _PREVIOUS_QUARTER, + }, +) + + +def _fmt(dt: datetime) -> str: + """Format a datetime as an ISO 8601 UTC string for use in Tantivy range queries.""" + return dt.astimezone(UTC).strftime("%Y-%m-%dT%H:%M:%SZ") + + +def _iso_range(lo: datetime, hi: datetime) -> str: + """Format a [lo TO hi] range string in ISO 8601 for Tantivy query syntax.""" + return f"[{_fmt(lo)} TO {_fmt(hi)}]" + + +def _quarter_start(d: date) -> date: + """Return the first day of the calendar quarter containing ``d``.""" + return date(d.year, ((d.month - 1) // 3) * 3 + 1, 1) + + +def _midnight(d: date, tz: tzinfo) -> datetime: + """Convert a calendar date at local-timezone midnight to a UTC datetime.""" + return datetime(d.year, d.month, d.day, tzinfo=tz).astimezone(UTC) + + +def _keyword_bounds(keyword: str, tz: tzinfo) -> tuple[date, date]: + """ + Map a relative date keyword to ``(start, exclusive_end)`` calendar dates. + + ``tz`` only determines what "today" is; the caller decides how the returned + dates become UTC datetime boundaries (date-only vs. local-midnight offset). + """ + today = datetime.now(tz).date() + if keyword == _TODAY: + return today, today + timedelta(days=1) + if keyword == _YESTERDAY: + return today - timedelta(days=1), today + if keyword == _PREVIOUS_WEEK: + this_monday = today - timedelta(days=today.weekday()) + return this_monday - timedelta(weeks=1), this_monday + if keyword == _THIS_MONTH: + first = today.replace(day=1) + return first, first + relativedelta(months=1) + if keyword == _PREVIOUS_MONTH: + this_first = today.replace(day=1) + return this_first - relativedelta(months=1), this_first + if keyword == _THIS_YEAR: + return date(today.year, 1, 1), date(today.year + 1, 1, 1) + if keyword == _PREVIOUS_YEAR: + return date(today.year - 1, 1, 1), date(today.year, 1, 1) + if keyword == _PREVIOUS_QUARTER: + this_quarter = _quarter_start(today) + return this_quarter - relativedelta(months=3), this_quarter + raise ValueError(f"Unknown keyword: {keyword}") + + +def _date_only_range(keyword: str, tz: tzinfo) -> str: + """ + For `created` (DateField): use the local calendar date, converted to + midnight UTC boundaries. No offset arithmetic — date only. + """ + start, end = _keyword_bounds(keyword, tz) + lo = datetime(start.year, start.month, start.day, tzinfo=UTC) + hi = datetime(end.year, end.month, end.day, tzinfo=UTC) + return _iso_range(lo, hi) + + +def _datetime_range(keyword: str, tz: tzinfo) -> str: + """ + For `added` / `modified` (DateTimeField, stored as UTC): convert local day + boundaries to UTC — full offset arithmetic required. + """ + start, end = _keyword_bounds(keyword, tz) + return _iso_range(_midnight(start, tz), _midnight(end, tz)) + + +def _precision_bounds(digits: str) -> tuple[date, date] | None: + """ + Map a 4/6/8-digit date token to (start, exclusive_end) calendar dates. + + YYYY -> whole year, YYYYMM -> whole month, YYYYMMDD -> single day. + Returns None for any unparsable or out-of-range value (e.g. month 23), + so callers can emit a no-match clause instead of erroring (Whoosh parity). + """ + try: + if len(digits) == 4: + year = int(digits) + return date(year, 1, 1), date(year + 1, 1, 1) + if len(digits) == 6: + year, month = int(digits[:4]), int(digits[4:6]) + start = date(year, month, 1) + end = date(year + 1, 1, 1) if month == 12 else date(year, month + 1, 1) + return start, end + if len(digits) == 8: + start = date(int(digits[:4]), int(digits[4:6]), int(digits[6:8])) + return start, start + timedelta(days=1) + except ValueError: + return None + return None + + +def _utc_bounds_for_field( + field: str, + start: date, + end: date, + tz: tzinfo, +) -> tuple[datetime, datetime]: + """ + Convert calendar-date bounds to UTC datetimes per the field's storage type. + + For DateField (``created``) the bounds are UTC midnight (no offset). For + DateTimeField (``added``/``modified``) the bounds are local-tz midnight + converted to UTC, matching how each field is indexed. + """ + if field in _DATE_ONLY_FIELDS: + return ( + datetime(start.year, start.month, start.day, tzinfo=UTC), + datetime(end.year, end.month, end.day, tzinfo=UTC), + ) + return ( + datetime(start.year, start.month, start.day, tzinfo=tz).astimezone(UTC), + datetime(end.year, end.month, end.day, tzinfo=tz).astimezone(UTC), + ) + + +def _field_range_from_dates(field: str, start: date, end: date, tz: tzinfo) -> str: + """Build a Tantivy ``field:[lo TO hi]`` ISO range from calendar-date bounds.""" + lo, hi = _utc_bounds_for_field(field, start, end, tz) + return f"{field}:{_iso_range(lo, hi)}" diff --git a/src/documents/search/_query.py b/src/documents/search/_query.py index 932d68bc1..749098447 100644 --- a/src/documents/search/_query.py +++ b/src/documents/search/_query.py @@ -1,88 +1,35 @@ from __future__ import annotations +import logging from datetime import UTC -from datetime import date -from datetime import datetime -from datetime import timedelta from typing import TYPE_CHECKING from typing import Final import regex import tantivy -from dateutil.relativedelta import relativedelta from django.conf import settings +from documents.search._dates import ( + _date_only_range, # noqa: F401 — re-exported for test imports +) +from documents.search._dates import ( + _datetime_range, # noqa: F401 — re-exported for test imports +) from documents.search._tokenizer import simple_search_tokens +from documents.search._translate import SearchQueryError +from documents.search._translate import translate_query if TYPE_CHECKING: from datetime import tzinfo from django.contrib.auth.base_user import AbstractBaseUser +logger = logging.getLogger("paperless.search") + # Maximum seconds any single regex substitution may run. # Prevents ReDoS on adversarial user-supplied query strings. _REGEX_TIMEOUT: Final[float] = 1.0 -_DATE_ONLY_FIELDS = frozenset({"created"}) - -_TODAY: Final[str] = "today" -_YESTERDAY: Final[str] = "yesterday" -_PREVIOUS_WEEK: Final[str] = "previous week" -_THIS_MONTH: Final[str] = "this month" -_PREVIOUS_MONTH: Final[str] = "previous month" -_THIS_YEAR: Final[str] = "this year" -_PREVIOUS_YEAR: Final[str] = "previous year" -_PREVIOUS_QUARTER: Final[str] = "previous quarter" - -_DATE_KEYWORDS = frozenset( - { - _TODAY, - _YESTERDAY, - _PREVIOUS_WEEK, - _THIS_MONTH, - _PREVIOUS_MONTH, - _THIS_YEAR, - _PREVIOUS_YEAR, - _PREVIOUS_QUARTER, - }, -) - -_DATE_KEYWORD_PATTERN = "|".join( - sorted((regex.escape(k) for k in _DATE_KEYWORDS), key=len, reverse=True), -) - -_FIELD_DATE_RE = regex.compile( - rf"""(?created|modified|added)\s*:\s*(?: - (?P["'])(?P{_DATE_KEYWORD_PATTERN})(?P=quote) - | - (?P{_DATE_KEYWORD_PATTERN})(?![\w-]) -)""", - regex.IGNORECASE | regex.VERBOSE, -) -_COMPACT_DATE_RE = regex.compile(r"\b(\d{14})\b") -_RELATIVE_RANGE_RE = regex.compile( - r"\[now([+-]\d+[dhm])?\s+TO\s+now([+-]\d+[dhm])?\]", - regex.IGNORECASE, -) -# Whoosh-style relative date range: e.g. [-1 week to now], [-7 days to now] -_WHOOSH_REL_RANGE_RE = regex.compile( - r"\[-(?P\d+)\s+(?Psecond|minute|hour|day|week|month|year)s?\s+to\s+now\]", - regex.IGNORECASE, -) -# Whoosh-style 8-digit date: field:YYYYMMDD — field-aware so timezone can be applied correctly. -# Scoped to date fields only; numeric fields (asn, id, page_count, ...) must not be rewritten. -_DATE8_RE = regex.compile( - r"(?created|modified|added):(?P\d{8})\b", -) -_YEAR_RANGE_RE = regex.compile( - r"(?created|modified|added):\[(?P\d{4})\s+TO\s+(?P\d{4})\]", - regex.IGNORECASE, -) -# Tantivy syntax error: " - " and " + " with spaces on both sides are invalid because -# the NOT/MUST operators require no space between the operator and the term. -# In natural-language queries (e.g., "H52.1 - Kurzsichtigkeit"), the dash is a separator. -_SPACED_OPERATOR_RE = regex.compile(r"\s+[-+]\s+") -_TRAILING_OPERATOR_RE = regex.compile(r"\s+[-+]+\s*$") # Matches CJK/Hangul characters so queries can be routed to bigram fields. # Uses Unicode properties to cover all blocks including Extension B+ planes. _CJK_RE: Final = regex.compile(r"[\p{Han}\p{Hiragana}\p{Katakana}\p{Hangul}]+") @@ -117,303 +64,12 @@ def _build_cjk_query( return None -def _fmt(dt: datetime) -> str: - """Format a datetime as an ISO 8601 UTC string for use in Tantivy range queries.""" - return dt.astimezone(UTC).strftime("%Y-%m-%dT%H:%M:%SZ") - - -def _iso_range(lo: datetime, hi: datetime) -> str: - """Format a [lo TO hi] range string in ISO 8601 for Tantivy query syntax.""" - return f"[{_fmt(lo)} TO {_fmt(hi)}]" - - -def _date_only_range(keyword: str, tz: tzinfo) -> str: - """ - For `created` (DateField): use the local calendar date, converted to - midnight UTC boundaries. No offset arithmetic — date only. - """ - - today = datetime.now(tz).date() - - def _quarter_start(d: date) -> date: - return date(d.year, ((d.month - 1) // 3) * 3 + 1, 1) - - if keyword == _TODAY: - lo = datetime(today.year, today.month, today.day, tzinfo=UTC) - return _iso_range(lo, lo + timedelta(days=1)) - if keyword == _YESTERDAY: - y = today - timedelta(days=1) - lo = datetime(y.year, y.month, y.day, tzinfo=UTC) - hi = datetime(today.year, today.month, today.day, tzinfo=UTC) - return _iso_range(lo, hi) - if keyword == _PREVIOUS_WEEK: - this_mon = today - timedelta(days=today.weekday()) - last_mon = this_mon - timedelta(weeks=1) - lo = datetime(last_mon.year, last_mon.month, last_mon.day, tzinfo=UTC) - hi = datetime(this_mon.year, this_mon.month, this_mon.day, tzinfo=UTC) - return _iso_range(lo, hi) - if keyword == _THIS_MONTH: - lo = datetime(today.year, today.month, 1, tzinfo=UTC) - if today.month == 12: - hi = datetime(today.year + 1, 1, 1, tzinfo=UTC) - else: - hi = datetime(today.year, today.month + 1, 1, tzinfo=UTC) - return _iso_range(lo, hi) - if keyword == _PREVIOUS_MONTH: - if today.month == 1: - lo = datetime(today.year - 1, 12, 1, tzinfo=UTC) - else: - lo = datetime(today.year, today.month - 1, 1, tzinfo=UTC) - hi = datetime(today.year, today.month, 1, tzinfo=UTC) - return _iso_range(lo, hi) - if keyword == _THIS_YEAR: - lo = datetime(today.year, 1, 1, tzinfo=UTC) - return _iso_range(lo, datetime(today.year + 1, 1, 1, tzinfo=UTC)) - if keyword == _PREVIOUS_YEAR: - lo = datetime(today.year - 1, 1, 1, tzinfo=UTC) - return _iso_range(lo, datetime(today.year, 1, 1, tzinfo=UTC)) - if keyword == _PREVIOUS_QUARTER: - this_quarter = _quarter_start(today) - last_quarter = this_quarter - relativedelta(months=3) - lo = datetime( - last_quarter.year, - last_quarter.month, - last_quarter.day, - tzinfo=UTC, - ) - hi = datetime( - this_quarter.year, - this_quarter.month, - this_quarter.day, - tzinfo=UTC, - ) - return _iso_range(lo, hi) - raise ValueError(f"Unknown keyword: {keyword}") - - -def _datetime_range(keyword: str, tz: tzinfo) -> str: - """ - For `added` / `modified` (DateTimeField, stored as UTC): convert local day - boundaries to UTC — full offset arithmetic required. - """ - - now_local = datetime.now(tz) - today = now_local.date() - - def _midnight(d: date) -> datetime: - return datetime(d.year, d.month, d.day, tzinfo=tz).astimezone(UTC) - - def _quarter_start(d: date) -> date: - return date(d.year, ((d.month - 1) // 3) * 3 + 1, 1) - - if keyword == _TODAY: - return _iso_range(_midnight(today), _midnight(today + timedelta(days=1))) - if keyword == _YESTERDAY: - y = today - timedelta(days=1) - return _iso_range(_midnight(y), _midnight(today)) - if keyword == _PREVIOUS_WEEK: - this_mon = today - timedelta(days=today.weekday()) - last_mon = this_mon - timedelta(weeks=1) - return _iso_range(_midnight(last_mon), _midnight(this_mon)) - if keyword == _THIS_MONTH: - first = today.replace(day=1) - if today.month == 12: - next_first = date(today.year + 1, 1, 1) - else: - next_first = date(today.year, today.month + 1, 1) - return _iso_range(_midnight(first), _midnight(next_first)) - if keyword == _PREVIOUS_MONTH: - this_first = today.replace(day=1) - if today.month == 1: - last_first = date(today.year - 1, 12, 1) - else: - last_first = date(today.year, today.month - 1, 1) - return _iso_range(_midnight(last_first), _midnight(this_first)) - if keyword == _THIS_YEAR: - return _iso_range( - _midnight(date(today.year, 1, 1)), - _midnight(date(today.year + 1, 1, 1)), - ) - if keyword == _PREVIOUS_YEAR: - return _iso_range( - _midnight(date(today.year - 1, 1, 1)), - _midnight(date(today.year, 1, 1)), - ) - if keyword == _PREVIOUS_QUARTER: - this_quarter = _quarter_start(today) - last_quarter = this_quarter - relativedelta(months=3) - return _iso_range(_midnight(last_quarter), _midnight(this_quarter)) - raise ValueError(f"Unknown keyword: {keyword}") - - -def _rewrite_compact_date(query: str) -> str: - """Rewrite Whoosh compact date tokens (14-digit YYYYMMDDHHmmss) to ISO 8601.""" - - def _sub(m: regex.Match[str]) -> str: - raw = m.group(1) - try: - dt = datetime( - int(raw[0:4]), - int(raw[4:6]), - int(raw[6:8]), - int(raw[8:10]), - int(raw[10:12]), - int(raw[12:14]), - tzinfo=UTC, - ) - return dt.strftime("%Y-%m-%dT%H:%M:%SZ") - except ValueError: - return str(m.group(0)) - - try: - return _COMPACT_DATE_RE.sub(_sub, query, timeout=_REGEX_TIMEOUT) - except TimeoutError: # pragma: no cover - raise ValueError( - "Query too complex to process (compact date rewrite timed out)", - ) - - -def _rewrite_relative_range(query: str) -> str: - """Rewrite Whoosh relative ranges ([now-7d TO now]) to concrete ISO 8601 UTC boundaries.""" - - def _sub(m: regex.Match[str]) -> str: - now = datetime.now(UTC) - - def _offset(s: str | None) -> timedelta: - if not s: - return timedelta(0) - sign = 1 if s[0] == "+" else -1 - n, unit = int(s[1:-1]), s[-1] - return ( - sign - * { - "d": timedelta(days=n), - "h": timedelta(hours=n), - "m": timedelta(minutes=n), - }[unit] - ) - - lo, hi = now + _offset(m.group(1)), now + _offset(m.group(2)) - if lo > hi: - lo, hi = hi, lo - return f"[{_fmt(lo)} TO {_fmt(hi)}]" - - try: - return _RELATIVE_RANGE_RE.sub(_sub, query, timeout=_REGEX_TIMEOUT) - except TimeoutError: # pragma: no cover - raise ValueError( - "Query too complex to process (relative range rewrite timed out)", - ) - - -def _rewrite_whoosh_relative_range(query: str) -> str: - """Rewrite Whoosh-style relative date ranges ([-N unit to now]) to ISO 8601. - - Supports: second, minute, hour, day, week, month, year (singular and plural). - Example: ``added:[-1 week to now]`` → ``added:[2025-01-01T… TO 2025-01-08T…]`` - """ - now = datetime.now(UTC) - - def _sub(m: regex.Match[str]) -> str: - n = int(m.group("n")) - unit = m.group("unit").lower() - delta_map: dict[str, timedelta | relativedelta] = { - "second": timedelta(seconds=n), - "minute": timedelta(minutes=n), - "hour": timedelta(hours=n), - "day": timedelta(days=n), - "week": timedelta(weeks=n), - "month": relativedelta(months=n), - "year": relativedelta(years=n), - } - lo = now - delta_map[unit] - return f"[{_fmt(lo)} TO {_fmt(now)}]" - - try: - return _WHOOSH_REL_RANGE_RE.sub(_sub, query, timeout=_REGEX_TIMEOUT) - except TimeoutError: # pragma: no cover - raise ValueError( - "Query too complex to process (Whoosh relative range rewrite timed out)", - ) - - -def _rewrite_8digit_date(query: str, tz: tzinfo) -> str: - """Rewrite field:YYYYMMDD date tokens to an ISO 8601 day range. - - Runs after ``_rewrite_compact_date`` so 14-digit timestamps are already - converted and won't spuriously match here. - - For DateField fields (e.g. ``created``) uses UTC midnight boundaries. - For DateTimeField fields (e.g. ``added``, ``modified``) uses local TZ - midnight boundaries converted to UTC — matching the ``_datetime_range`` - behaviour for keyword dates. - """ - - def _sub(m: regex.Match[str]) -> str: - field = m.group("field") - raw = m.group("date8") - try: - year, month, day = int(raw[0:4]), int(raw[4:6]), int(raw[6:8]) - d = date(year, month, day) - if field in _DATE_ONLY_FIELDS: - lo = datetime(d.year, d.month, d.day, tzinfo=UTC) - hi = lo + timedelta(days=1) - else: - # DateTimeField: use local-timezone midnight → UTC - lo = datetime(d.year, d.month, d.day, tzinfo=tz).astimezone(UTC) - hi = datetime( - (d + timedelta(days=1)).year, - (d + timedelta(days=1)).month, - (d + timedelta(days=1)).day, - tzinfo=tz, - ).astimezone(UTC) - return f"{field}:[{_fmt(lo)} TO {_fmt(hi)}]" - except ValueError: - return m.group(0) - - try: - return _DATE8_RE.sub(_sub, query, timeout=_REGEX_TIMEOUT) - except TimeoutError: # pragma: no cover - raise ValueError( - "Query too complex to process (8-digit date rewrite timed out)", - ) - - -def _rewrite_year_range(query: str) -> str: - """Rewrite Whoosh-style year-only date ranges to ISO 8601 UTC boundaries. - - Converts ``field:[YYYY TO YYYY]`` to a full ISO 8601 datetime range. - The upper bound is the start of the year after the end year (exclusive), - matching the Whoosh convention of treating year-only ranges as full-year spans. - """ - - def _sub(m: regex.Match[str]) -> str: - field = m.group("field") - y1, y2 = int(m.group("y1")), int(m.group("y2")) - # Whoosh swaps a reversed range when both years are explicit - # (whoosh.util.times.timespan.disambiguated); match that so a backwards - # range spans the intended years instead of matching nothing. - lo_year, hi_year = min(y1, y2), max(y1, y2) - lo = datetime(lo_year, 1, 1, tzinfo=UTC) - hi = datetime(hi_year + 1, 1, 1, tzinfo=UTC) - return f"{field}:[{_fmt(lo)} TO {_fmt(hi)}]" - - try: - return _YEAR_RANGE_RE.sub(_sub, query, timeout=_REGEX_TIMEOUT) - except TimeoutError: # pragma: no cover - raise ValueError("Query too complex to process (year range rewrite timed out)") - - def rewrite_natural_date_keywords(query: str, tz: tzinfo) -> str: """ Rewrite natural date syntax to ISO 8601 format for Tantivy compatibility. - Performs the first stage of query preprocessing, converting various date - formats and keywords to ISO 8601 datetime ranges that Tantivy can parse: - - Compact 14-digit dates (YYYYMMDDHHmmss) - - Whoosh relative ranges ([-7 days to now], [now-1h TO now+2h]) - - 8-digit dates with field awareness (created:20240115) - - Natural keywords (field:today, field:"previous quarter", etc.) + Delegates to ``translate_query`` which handles all date forms, comma + expansion, field aliasing, relative ranges, and operator normalization. Args: query: Raw user query string @@ -425,35 +81,15 @@ def rewrite_natural_date_keywords(query: str, tz: tzinfo) -> str: Note: Bare keywords without field prefixes pass through unchanged. """ - query = _rewrite_compact_date(query) - query = _rewrite_whoosh_relative_range(query) - query = _rewrite_year_range(query) - query = _rewrite_8digit_date(query, tz) - query = _rewrite_relative_range(query) - - def _replace(m: regex.Match[str]) -> str: - field = m.group("field") - keyword = (m.group("quoted") or m.group("bare")).lower() - if field in _DATE_ONLY_FIELDS: - return f"{field}:{_date_only_range(keyword, tz)}" - return f"{field}:{_datetime_range(keyword, tz)}" - - try: - return _FIELD_DATE_RE.sub(_replace, query, timeout=_REGEX_TIMEOUT) - except TimeoutError: # pragma: no cover - raise ValueError( - "Query too complex to process (date keyword rewrite timed out)", - ) + return translate_query(query, tz) def normalize_query(query: str) -> str: """ Normalize query syntax for better search behavior. - Expands comma-separated field values to explicit AND clauses and - collapses excessive whitespace for cleaner parsing: - - tag:foo,bar → tag:foo AND tag:bar - - multiple spaces → single spaces + Delegates to ``translate_query`` which handles comma expansion, whitespace + collapsing, operator normalization, and field aliasing. Args: query: Query string after date rewriting @@ -461,29 +97,7 @@ def normalize_query(query: str) -> str: Returns: Normalized query string ready for Tantivy parsing """ - - def _expand(m: regex.Match[str]) -> str: - field = m.group(1) - values = [v.strip() for v in m.group(2).split(",") if v.strip()] - return " AND ".join(f"{field}:{v}" for v in values) - - try: - query = regex.sub( - r"(\w+):([^\s\[\]]+(?:,[^\s\[\]]+)+)", - _expand, - query, - timeout=_REGEX_TIMEOUT, - ) - query = regex.sub(r" {2,}", " ", query, timeout=_REGEX_TIMEOUT).strip() - # Strip trailing dangling operators before Tantivy sees them. - query = _TRAILING_OPERATOR_RE.sub("", query, timeout=_REGEX_TIMEOUT).strip() - # Replace " - " / " + " with a space: Tantivy requires no space between - # the operator and its operand (-term / +term), so spaces on both sides - # means this is a natural-language separator, not a query operator. - query = _SPACED_OPERATOR_RE.sub(" ", query, timeout=_REGEX_TIMEOUT).strip() - return query - except TimeoutError: # pragma: no cover - raise ValueError("Query too complex to process (normalization timed out)") + return translate_query(query, UTC) def build_permission_filter( @@ -603,8 +217,16 @@ def parse_user_query( as a post-search score filter, not during query construction. """ - query_str = rewrite_natural_date_keywords(raw_query, tz) - query_str = normalize_query(query_str) + try: + query_str = translate_query(raw_query, tz) + except SearchQueryError: + # Intentional, user-fixable error (e.g. an unparsable date). Propagate so + # the view can return a 400 with a helpful message rather than falling + # back to the raw (still-invalid) query. + raise + except Exception: # pragma: no cover - defensive + logger.warning("Query translation failed; using raw query", exc_info=True) + query_str = raw_query exact = index.parse_query( query_str, diff --git a/src/documents/search/_translate.py b/src/documents/search/_translate.py new file mode 100644 index 000000000..f25c9c0f2 --- /dev/null +++ b/src/documents/search/_translate.py @@ -0,0 +1,566 @@ +from __future__ import annotations + +from dataclasses import dataclass +from datetime import UTC +from datetime import datetime +from datetime import timedelta +from typing import TYPE_CHECKING +from typing import TypeAlias + +import regex +from dateutil.relativedelta import relativedelta + +from documents.search._dates import _DATE_KEYWORDS +from documents.search._dates import _DATE_ONLY_FIELDS +from documents.search._dates import _date_only_range +from documents.search._dates import _datetime_range +from documents.search._dates import _field_range_from_dates +from documents.search._dates import _fmt +from documents.search._dates import _precision_bounds +from documents.search._dates import _utc_bounds_for_field + +# Compiled regex that matches any known multi-word (or single-word) date keyword +# at the start of a match position, longest alternatives first so "previous week" +# wins over a hypothetical shorter "previous". +_KEYWORD_VALUE_RE = regex.compile( + "|".join(sorted((regex.escape(k) for k in _DATE_KEYWORDS), key=len, reverse=True)), + regex.IGNORECASE, +) + +if TYPE_CHECKING: + from datetime import tzinfo + +# TODO: this module translates date queries into Tantivy *string* syntax, which +# forces a workaround for something Tantivy's string parser cannot express on +# date fields: open-ended ranges use far-past/far-future string sentinels +# (OPEN_LO/OPEN_HI). These can be replaced with a real tantivy.Query object +# (Query.range_query(..., None) for open bounds) once tantivy-py accepts Python +# datetimes in range_query/term_query on Date fields. That support exists on +# tantivy-py master (PRs #655 + #666) but postdates the pinned 0.26.0 wheel, so +# it is blocked only on a published release > 0.26.0 and a dependency bump. +# (Unparsable dates now raise InvalidDateQuery -> HTTP 400 rather than using a +# no-match string sentinel.) + +# Fields that store exact, non-analyzed comma-joined tokens in the index and so +# need explicit comma->AND expansion (Whoosh KEYWORD(commas=True) set). +MULTI_VALUE_FIELDS = frozenset({"tag", "tag_id", "viewer_id"}) + +# Date fields whose values/ranges get rewritten to RFC3339 Tantivy ranges. +DATE_FIELDS = frozenset({"created", "modified", "added"}) + +# Field aliases: Whoosh (v2) field names that were renamed in the Tantivy schema. +# Preserved here so v2 queries using the old names continue to work without 400 +# errors instead of silently failing. Applied by _render to non-date field tokens. +FIELD_ALIASES: dict[str, str] = { + "type": "document_type", + "type_id": "document_type_id", + "path": "storage_path", + "path_id": "storage_path_id", +} + +# Known schema fields: a comma immediately followed by ``:`` is a clause +# separator. Restricting to known fields prevents URL-like ``http:`` misfires. +KNOWN_FIELDS = frozenset( + { + "title", + "content", + "correspondent", + "document_type", + "type", # v2 alias -> document_type + "storage_path", + "path", # v2 alias -> storage_path + "tag", + "tag_id", + "correspondent_id", + "document_type_id", + "type_id", # v2 alias -> document_type_id + "storage_path_id", + "path_id", # v2 alias -> storage_path_id + "owner_id", + "viewer_id", + "asn", + "page_count", + "num_notes", + "created", + "modified", + "added", + "original_filename", + "checksum", + "notes", + "custom_fields", + }, +) + +_FIELD_RE = regex.compile(r"(?P\w+):") + +# Matches the TO separator inside a range bracket. Handles three forms: +# middle: "lo TO hi" (either lo or hi may be empty) +# trailing: "lo TO" (open upper bound) +# leading: "TO hi" (open lower bound) +# Bounds MAY contain internal spaces (e.g. "-7 days"), so we use .*? / .+? +# and split on the whitespace-delimited " TO " / " to " separator. +_RANGE_RE = regex.compile( + r"^\s*(?P.*?)\s+[Tt][Oo]\s+(?P.+?)\s*$" + r"|" + r"^\s*(?P.+?)\s+[Tt][Oo]\s*$" + r"|" + r"^\s*[Tt][Oo]\s+(?P.+?)\s*$", +) + + +@dataclass(frozen=True, slots=True) +class FieldValue: + field: str + value: str + + +# Produced by the comma-resolution pass (not by scan()). +@dataclass(frozen=True, slots=True) +class FieldValueList: + field: str + values: tuple[str, ...] + + +@dataclass(frozen=True, slots=True) +class FieldRange: + field: str + open: str + lo: str + hi: str + close: str + + +# Produced by the comma-resolution pass (not by scan()). +@dataclass(frozen=True, slots=True) +class Comma: + pass + + +@dataclass(frozen=True, slots=True) +class Passthrough: + raw: str + + +Token: TypeAlias = FieldValue | FieldValueList | FieldRange | Comma | Passthrough + +_CLOSE: dict[str, str] = {"[": "]", "{": "}"} + + +def scan(query: str) -> list[Token]: + """ + Tokenize a raw query into date/comma-aware tokens, leaving everything else + as verbatim ``Passthrough`` runs. Non-recursive: finds the first matching + close bracket/quote. Nested brackets are not valid Tantivy range syntax and + pass through verbatim on mismatch. + """ + tokens: list[Token] = [] + buf: list[str] = [] # accumulates passthrough chars + i, n = 0, len(query) + while i < n: + matched = _match_field_token(query, i) + if matched is None: + buf.append(query[i]) + i += 1 + continue + token, i = matched + _flush(buf, tokens) + tokens.append(token) + i = _maybe_comma(query, i, tokens) + _flush(buf, tokens) + return tokens + + +def _flush(buf: list[str], tokens: list[Token]) -> None: + """Emit any accumulated passthrough characters as a single token.""" + if buf: + tokens.append(Passthrough("".join(buf))) + buf.clear() + + +def _at_word_boundary(query: str, i: int) -> bool: + """A field token may begin only at the start or after a non-word character.""" + return i == 0 or not (query[i - 1].isalnum() or query[i - 1] == "_") + + +def _match_field_token(query: str, i: int) -> tuple[Token, int] | None: + """ + If a known ``field:`` token starts at ``i``, consume it and return + ``(token, end_index)``; otherwise return None so the caller treats the + character as passthrough. Handles both ``field:[range]`` and ``field:value``, + and returns None when the range/value cannot be consumed. + """ + m = _FIELD_RE.match(query, i) + if m is None or m.group("field") not in KNOWN_FIELDS: + return None + if not _at_word_boundary(query, i): + return None + field = m.group("field") + j = m.end() + if j < len(query) and query[j] in "[{": + return _consume_range(query, j, field) + consumed = _consume_field_value(query, field, j) + if consumed is None: + return None + value, end = consumed + return FieldValue(field, value), end + + +def _consume_field_value(query: str, field: str, start: int) -> tuple[str, int] | None: + """ + Consume a field value starting at ``start``: a multi-word date keyword phrase + (date fields only), or a bare/quoted value, then absorb any comma-joined + continuation that is not a clause separator. ``resolve_commas`` later splits a + multi-value field's joined value into a ``FieldValueList``; for other fields + the comma stays literal. + """ + n = len(query) + consumed = None + if field in DATE_FIELDS: + km = _KEYWORD_VALUE_RE.match(query, start) + if km is not None and (km.end() >= n or query[km.end()] in " \t),"): + consumed = (km.group(0), km.end()) + if consumed is None: + consumed = _consume_value(query, start) + if consumed is None: + return None + value, k = consumed + while k < n and query[k] == ",": + if _looks_like_known_field(query, k + 1): + break # clause separator: left for _maybe_comma to emit a Comma() + more = _consume_value(query, k + 1) + if more is None: + break + value = f"{value},{more[0]}" + k = more[1] + return value, k + + +def _consume_range( + query: str, + start: int, + field: str, +) -> tuple[FieldRange, int] | None: + """Consume ``[lo TO hi]`` / ``{lo TO hi}`` from ``start`` (the bracket).""" + open_br = query[start] + close_br = _CLOSE[open_br] + end = query.find(close_br, start + 1) + if end == -1: + return None + inner = query[start + 1 : end] + m = _RANGE_RE.match(inner) + if m is not None: + if m.group("lo") is not None or m.group("hi") is not None: + # Middle form: "lo TO hi" (either may be empty string) + lo = (m.group("lo") or "").strip() + hi = (m.group("hi") or "").strip() + elif m.group("lo2") is not None: + # Trailing form: "lo TO" + lo = m.group("lo2").strip() + hi = "" + else: + # Leading form: "TO hi" + lo = "" + hi = (m.group("hi2") or "").strip() + else: + lo, hi = inner.strip(), "" + return FieldRange(field, open_br, lo, hi, close_br), end + 1 + + +def _consume_value(query: str, start: int) -> tuple[str, int] | None: + """Consume a bare or quoted field value from ``start``, stopping at comma.""" + n = len(query) + if start >= n or query[start] in " \t": + return None + if query[start] in "\"'": + quote = query[start] + end = query.find(quote, start + 1) + if end == -1: + return None + return query[start : end + 1], end + 1 + j = start + while j < n and query[j] not in " \t),": + j += 1 + return query[start:j], j + + +def _looks_like_known_field(query: str, pos: int) -> bool: + """True if a known ``field:`` token starts at ``pos``.""" + m = _FIELD_RE.match(query, pos) + return bool(m and m.group("field") in KNOWN_FIELDS) + + +def _maybe_comma(query: str, i: int, tokens: list) -> int: + """If a clause-separator comma follows at ``i``, emit ``Comma()`` and advance.""" + if i < len(query) and query[i] == "," and _looks_like_known_field(query, i + 1): + tokens.append(Comma()) + return i + 1 + return i + + +def resolve_commas(tokens: list) -> list: + """ + Collapse value-list commas into ``FieldValueList`` and keep clause-separator + commas as ``Comma``. (Clause-sep commas are already emitted by ``scan`` via + the value-stop logic; this pass folds value-lists.) + """ + out: list = [] + for tok in tokens: + if ( + isinstance(tok, FieldValue) + and tok.field in MULTI_VALUE_FIELDS + and "," in tok.value + ): + values = tuple(v for v in tok.value.split(",") if v) + out.append(FieldValueList(tok.field, values)) + else: + out.append(tok) + return out + + +class SearchQueryError(ValueError): + """ + Base for user-fixable search query errors. + + Carries a message safe to surface to the user (no internal details). The view + layer catches this and returns an HTTP 400, so any future subclass (unknown + field, malformed range, wrapped parser errors) gets the same treatment. + """ + + +class InvalidDateQuery(SearchQueryError): + """Raised when a date field value or range bound cannot be parsed.""" + + def __init__(self, field: str, value: str) -> None: + self.field = field + self.value = value + super().__init__(f"Invalid date value {value!r} for field {field!r}.") + + +_DIGITS_RE = regex.compile(r"^\d{4}(?:\d{2}){0,2}$") +_ISO_RE = regex.compile(r"^\d{4}(?:-\d{2}(?:-\d{2})?)?$") + + +def translate_scalar(field: str, value: str, tz: tzinfo) -> str: + """Translate a bare date-field value to a Tantivy range string.""" + bare = value.strip("\"'").lower() + if bare in _DATE_KEYWORDS: + if field in _DATE_ONLY_FIELDS: + return f"{field}:{_date_only_range(bare, tz)}" + return f"{field}:{_datetime_range(bare, tz)}" + digits = value.replace("-", "") + if _DIGITS_RE.match(value) or _ISO_RE.match(value): + bounds = _precision_bounds(digits) + if bounds is None: + raise InvalidDateQuery(field, value) + return _field_range_from_dates(field, bounds[0], bounds[1], tz) + if regex.fullmatch(r"\d{14}", value): + try: + dt = datetime( + int(value[0:4]), + int(value[4:6]), + int(value[6:8]), + int(value[8:10]), + int(value[10:12]), + int(value[12:14]), + tzinfo=UTC, + ) + except ValueError: + raise InvalidDateQuery(field, value) from None + iso = _fmt(dt) + return f"{field}:[{iso} TO {iso}]" + # Unrecognized shape -> tell the user their date is malformed rather than + # silently matching nothing or emitting invalid Tantivy syntax. + raise InvalidDateQuery(field, value) + + +# Open-bound sentinels for date ranges. These far-past/far-future strings allow +# open-ended ranges to be expressed as Tantivy string queries until tantivy-py +# exposes Query.range_query(..., None) on Date fields (see module TODO). +OPEN_LO = "0001-01-01T00:00:00Z" +OPEN_HI = "9999-12-31T23:59:59Z" + + +# Matches compact now-offset tokens like now-7d, now+1h, now-30m. +_NOW_COMPACT_RE = regex.compile( + r"^now(?P[+-])(?P\d+)(?P[dhm])$", + regex.IGNORECASE, +) + +# Matches "±N " Whoosh-style offsets (e.g. -7 days, -1 week, +3 hours) +# Unit is singular or plural; sign prefix is mandatory. +_NOW_SPACED_RE = regex.compile( + r"^(?P[+-])(?P\d+)\s*" + r"(?Psecond|minute|hour|day|week|month|year)s?$", + regex.IGNORECASE, +) + + +def _resolve_relative_bound(token: str) -> datetime | None: + """ + Resolve a relative bound token to an exact UTC instant, or return None. + + Supported forms: + - ``now`` -> current UTC instant + - ``now+/-d/h/m`` -> now +/- timedelta (d=days, h=hours, m=minutes) + - ``±N `` -> now +/- delta; month/year use relativedelta + """ + stripped = token.strip() + low = stripped.lower() + now = datetime.now(UTC) + + if low == "now": + return now + + m = _NOW_COMPACT_RE.match(stripped) + if m: + sign = 1 if m.group("sign") == "+" else -1 + n = int(m.group("n")) + unit = m.group("unit").lower() + delta = ( + sign + * { + "d": timedelta(days=n), + "h": timedelta(hours=n), + "m": timedelta(minutes=n), + }[unit] + ) + return now + delta + + m = _NOW_SPACED_RE.match(stripped) + if m: + sign = 1 if m.group("sign") == "+" else -1 + n = int(m.group("n")) + unit = m.group("unit").lower() + delta_map: dict[str, timedelta | relativedelta] = { + "second": timedelta(seconds=n), + "minute": timedelta(minutes=n), + "hour": timedelta(hours=n), + "day": timedelta(days=n), + "week": timedelta(weeks=n), + "month": relativedelta(months=n), + "year": relativedelta(years=n), + } + return now - delta_map[unit] if sign == -1 else now + delta_map[unit] + + return None + + +def _bound_datetimes( + field: str, + token: str, + tz: tzinfo, +) -> tuple[datetime, datetime] | None: + """ + Return (floor_dt, ceil_dt) UTC datetimes for a single range bound token, or + None if the token is unparsable. ``now`` and relative offsets resolve to the + current instant (floor == ceil == that instant; no day-flooring). + """ + token = token.strip() + + # Try relative/now forms first (before stripping hyphens which would mangle them). + rel = _resolve_relative_bound(token) + if rel is not None: + return rel, rel + + # Full ISO datetime token (contains "T"): parse directly and return an exact + # instant (floor == ceil). Python 3.11+ datetime.fromisoformat accepts trailing Z. + if "T" in token: + try: + dt = datetime.fromisoformat(token) + # Ensure timezone-aware UTC result. + dt = dt.replace(tzinfo=UTC) if dt.tzinfo is None else dt.astimezone(UTC) + return dt, dt + except ValueError: + return None + + digits = token.replace("-", "") + bounds = _precision_bounds(digits) + if bounds is None: + return None + start, end = bounds + return _utc_bounds_for_field(field, start, end, tz) + + +def _render(tok: Token, tz: tzinfo) -> str: + """Render a single token back to a Tantivy query string fragment.""" + if isinstance(tok, Passthrough): + return tok.raw + if isinstance(tok, Comma): + return " AND " + if isinstance(tok, FieldValueList): + field = FIELD_ALIASES.get(tok.field, tok.field) + return " AND ".join(f"{field}:{v}" for v in tok.values) + if isinstance(tok, FieldValue): + field = FIELD_ALIASES.get(tok.field, tok.field) + if field in DATE_FIELDS: + return translate_scalar(field, tok.value, tz) + return f"{field}:{tok.value}" + if isinstance(tok, FieldRange): + field = FIELD_ALIASES.get(tok.field, tok.field) + if field in DATE_FIELDS: + return translate_range(field, tok.lo, tok.hi, tz) + return f"{field}:{tok.open}{tok.lo} TO {tok.hi}{tok.close}" + return "" # pragma: no cover + + +# Post-render operator normalization patterns: collapse repeated whitespace and +# strip spaced/trailing Tantivy boolean operators that would otherwise be invalid. +_MULTI_SPACE_RE = regex.compile(r" {2,}") +_TRAILING_OP_RE = regex.compile(r"\s+[-+]+\s*$") +_SPACED_OP_RE = regex.compile(r"\s+[-+]\s+") + + +def _normalize_operators(text: str) -> str: + """ + Collapse multiple spaces, strip trailing dangling operators, and replace + spaced operators (`` - `` / `` + ``) with a single space. + + Applied only to Passthrough fragments (the rendered output is scanned for + operator artifacts outside bracketed ranges) via a post-render pass on the + full rendered string. This preserves date ranges (``[... TO ...]``) verbatim + while cleaning natural-language separators in the surrounding text. + """ + text = _MULTI_SPACE_RE.sub(" ", text) + text = _TRAILING_OP_RE.sub("", text).strip() + text = _SPACED_OP_RE.sub(" ", text).strip() + return text + + +def translate_query(raw: str, tz: tzinfo) -> str: + """Translate a raw Whoosh-style query into Tantivy-compatible syntax.""" + tokens = resolve_commas(scan(raw)) + rendered = "".join(_render(t, tz) for t in tokens) + return _normalize_operators(rendered) + + +def translate_range(field: str, lo: str, hi: str, tz: tzinfo) -> str: + """Translate a date-field ``[lo TO hi]`` range to a Tantivy ISO range string. + + Handles partial-date bounds (YYYY, YYYYMM, YYYYMMDD, ISO dash variants), + open bounds (empty string -> OPEN_LO/OPEN_HI), ``now``, and reversed ranges + (swaps tokens before computing floor/ceil so the span is always correct). + """ + lo_s = lo.strip() + hi_s = hi.strip() + + # Parse both bounds to (floor, ceil) pairs when present. + lo_pair: tuple[datetime, datetime] | None = None + hi_pair: tuple[datetime, datetime] | None = None + + if lo_s: + lo_pair = _bound_datetimes(field, lo_s, tz) + if lo_pair is None: + raise InvalidDateQuery(field, lo_s) + if hi_s: + hi_pair = _bound_datetimes(field, hi_s, tz) + if hi_pair is None: + raise InvalidDateQuery(field, hi_s) + + # Detect a reversed range: only swap when BOTH bounds are present. + if lo_pair is not None and hi_pair is not None and lo_pair[0] > hi_pair[0]: + lo_pair, hi_pair = hi_pair, lo_pair + + lo_iso = _fmt(lo_pair[0]) if lo_pair is not None else OPEN_LO + hi_iso = _fmt(hi_pair[1]) if hi_pair is not None else OPEN_HI + + return f"{field}:[{lo_iso} TO {hi_iso}]" diff --git a/src/documents/tests/search/conftest.py b/src/documents/tests/search/conftest.py index ccc26d695..93c560043 100644 --- a/src/documents/tests/search/conftest.py +++ b/src/documents/tests/search/conftest.py @@ -1,11 +1,15 @@ from __future__ import annotations +import tempfile from typing import TYPE_CHECKING import pytest +import tantivy from documents.search._backend import TantivyBackend from documents.search._backend import reset_backend +from documents.search._schema import build_schema +from documents.search._tokenizer import register_tokenizers if TYPE_CHECKING: from collections.abc import Generator @@ -31,3 +35,11 @@ def backend() -> Generator[TantivyBackend, None, None]: finally: b.close() reset_backend() + + +@pytest.fixture(scope="module") +def index() -> tantivy.Index: + """A real Tantivy index for parse-acceptance tests (module scope for speed).""" + idx = tantivy.Index(build_schema(), path=tempfile.mkdtemp()) + register_tokenizers(idx, "english") + return idx diff --git a/src/documents/tests/search/test_query.py b/src/documents/tests/search/test_query.py index 08d8ee260..60ba2bf2f 100644 --- a/src/documents/tests/search/test_query.py +++ b/src/documents/tests/search/test_query.py @@ -13,7 +13,6 @@ import time_machine from documents.search._query import _date_only_range from documents.search._query import _datetime_range -from documents.search._query import _rewrite_compact_date from documents.search._query import build_permission_filter from documents.search._query import normalize_query from documents.search._query import parse_simple_text_highlight_query @@ -21,6 +20,7 @@ from documents.search._query import parse_user_query from documents.search._query import rewrite_natural_date_keywords from documents.search._schema import build_schema from documents.search._tokenizer import register_tokenizers +from documents.search._translate import InvalidDateQuery if TYPE_CHECKING: from django.contrib.auth.base_user import AbstractBaseUser @@ -405,12 +405,14 @@ class TestWhooshQueryRewriting: assert lo == "2023-12-01T05:00:00Z" assert hi == "2023-12-02T05:00:00Z" - def test_8digit_invalid_date_passes_through_unchanged(self) -> None: - assert rewrite_natural_date_keywords("added:20231340", UTC) == "added:20231340" - - def test_compact_14digit_invalid_date_passes_through_unchanged(self) -> None: - # Month=13 makes datetime() raise ValueError; the token must be left as-is - assert _rewrite_compact_date("20231300120000") == "20231300120000" + def test_8digit_invalid_date_raises(self) -> None: + # The translation pipeline raises InvalidDateQuery for unparsable dates + # (e.g. month=13) so the API can surface a 400 telling the user the date + # is malformed instead of silently returning zero results. + with pytest.raises(InvalidDateQuery) as exc_info: + rewrite_natural_date_keywords("added:20231340", UTC) + assert exc_info.value.field == "added" + assert exc_info.value.value == "20231340" class TestParseUserQuery: @@ -463,6 +465,67 @@ class TestParseUserQuery: ) -> None: assert isinstance(parse_user_query(query_index, raw_query, UTC), tantivy.Query) + @pytest.mark.parametrize( + "raw_query", + [ + # Partial date scalar (year only) + pytest.param("created:2020", id="created_year_scalar"), + # 8-digit compact date range in brackets + pytest.param( + "created:[20200101 TO 20201231]", + id="created_8digit_bracket_range", + ), + # Comma-separated field + date range (Whoosh v2 multi-clause syntax) + pytest.param( + "title:x,created:[2020 TO 2021]", + id="title_comma_created_range", + ), + # Field alias: type -> document_type + pytest.param("type:invoice", id="type_alias"), + # Multi-word date keyword + pytest.param("created:previous week", id="created_previous_week"), + # Full ISO datetime range + pytest.param( + "created:[2026-01-01T00:00:00Z TO 2026-06-01T00:00:00Z]", + id="created_iso_range", + ), + # Comma-separated ISO ranges (Whoosh v2 syntax) + pytest.param( + "created:[2026-01-01T00:00:00Z TO 2026-06-01T00:00:00Z]," + "added:[2026-05-01T00:00:00Z TO 2026-06-01T00:00:00Z]", + id="comma_iso_ranges", + ), + ], + ) + def test_advanced_search_queries_do_not_raise( + self, + query_index: tantivy.Index, + raw_query: str, + ) -> None: + """ + End-to-end: queries that the frontend sends must parse without raising. + + This tests the full pipeline: translate_query -> tantivy parse_query. + Equivalent to asserting HTTP 200 (not 400) for each query form. + """ + with time_machine.travel(datetime(2026, 6, 15, 12, 0, tzinfo=UTC), tick=False): + assert isinstance( + parse_user_query(query_index, raw_query, UTC), + tantivy.Query, + ) + + def test_invalid_date_propagates_not_swallowed( + self, + query_index: tantivy.Index, + ) -> None: + # parse_user_query falls back to the raw query on unexpected translation + # errors, but an InvalidDateQuery is intentional and must propagate so the + # view can return a 400 instead of silently parsing the raw (invalid) date. + with pytest.raises(InvalidDateQuery) as exc_info: + parse_user_query(query_index, "created:202023", UTC) + assert exc_info.value.field == "created" + assert exc_info.value.value == "202023" + class TestYearRangeRewriting: """Whoosh-style year-only date ranges must be rewritten to ISO 8601.""" @@ -542,11 +605,16 @@ class TestYearRangeRewriting: assert rewrite_natural_date_keywords(original, UTC) == original def test_8digit_in_brackets_not_matched_as_year_range(self) -> None: - # [YYYYMMDD TO YYYYMMDD] has 8-digit values - must not be caught by year rewriter + # [YYYYMMDD TO YYYYMMDD]: the translation layer converts 8-digit bounds to + # ISO day ranges. 20200101 -> 2020-01-01T00:00:00Z (lo of that day); + # 20201231 -> the ceil of Dec 31 = 2021-01-01T00:00:00Z (exclusive end). + # This is the correct and accepted behavior: old compact form becomes a + # proper Tantivy-parseable ISO range. original = "created:[20200101 TO 20201231]" result = rewrite_natural_date_keywords(original, UTC) - assert "20200101" in result or "2020-01-01" in result - assert "20201231" in result or "2020-12-31" in result + lo, hi = _range(result, "created") + assert lo == "2020-01-01T00:00:00Z" + assert hi == "2021-01-01T00:00:00Z" class TestNonDateFieldsNotRewritten: @@ -606,6 +674,16 @@ class TestNormalizeQuery: def test_normalize_expands_comma_separated_tags(self) -> None: assert normalize_query("tag:foo,bar") == "tag:foo AND tag:bar" + def test_normalize_comma_between_range_expressions(self) -> None: + # Comma-separated field range expressions (Whoosh v2 syntax) must be + # converted to AND so Tantivy does not receive an invalid comma. + q = "created:[2026-01-01T00:00:00Z TO 2026-06-01T00:00:00Z],added:[2026-05-01T00:00:00Z TO 2026-06-01T00:00:00Z]" + assert normalize_query(q) == ( + "created:[2026-01-01T00:00:00Z TO 2026-06-01T00:00:00Z]" + " AND " + "added:[2026-05-01T00:00:00Z TO 2026-06-01T00:00:00Z]" + ) + def test_normalize_expands_three_values(self) -> None: assert normalize_query("tag:foo,bar,baz") == "tag:foo AND tag:bar AND tag:baz" diff --git a/src/documents/tests/search/test_translate.py b/src/documents/tests/search/test_translate.py new file mode 100644 index 000000000..d927a87bc --- /dev/null +++ b/src/documents/tests/search/test_translate.py @@ -0,0 +1,742 @@ +from __future__ import annotations + +from datetime import UTC +from datetime import datetime +from typing import TYPE_CHECKING +from zoneinfo import ZoneInfo + +import pytest +import time_machine + +from documents.search._dates import _precision_bounds + +if TYPE_CHECKING: + import tantivy +from documents.search._query import _FIELD_BOOSTS +from documents.search._query import DEFAULT_SEARCH_FIELDS +from documents.search._translate import OPEN_HI +from documents.search._translate import OPEN_LO +from documents.search._translate import Comma +from documents.search._translate import FieldRange +from documents.search._translate import FieldValue +from documents.search._translate import FieldValueList +from documents.search._translate import InvalidDateQuery +from documents.search._translate import Passthrough +from documents.search._translate import resolve_commas +from documents.search._translate import scan +from documents.search._translate import translate_query +from documents.search._translate import translate_range +from documents.search._translate import translate_scalar + + +@pytest.mark.search +class TestPrecisionBounds: + @pytest.mark.parametrize( + ("digits", "expected"), + [ + ("2020", ((2020, 1, 1), (2021, 1, 1))), + ("202003", ((2020, 3, 1), (2020, 4, 1))), + ("202012", ((2020, 12, 1), (2021, 1, 1))), + ("20200115", ((2020, 1, 15), (2020, 1, 16))), + ("20201231", ((2020, 12, 31), (2021, 1, 1))), + ], + ) + def test_valid(self, digits, expected): + lo, hi = _precision_bounds(digits) + assert (lo.year, lo.month, lo.day) == expected[0] + assert (hi.year, hi.month, hi.day) == expected[1] + + @pytest.mark.parametrize("digits", ["202023", "20200230", "20201301", "20", "abcd"]) + def test_invalid_returns_none(self, digits): + assert _precision_bounds(digits) is None + + +@pytest.mark.search +class TestScan: + def test_plain_words_are_passthrough(self): + assert scan("bank statement") == [Passthrough("bank statement")] + + def test_field_value(self): + assert scan("created:2020") == [FieldValue("created", "2020")] + + def test_field_value_in_boolean(self): + toks = scan("created:2020 OR foo") + assert toks == [ + FieldValue("created", "2020"), + Passthrough(" OR foo"), + ] + + def test_field_value_in_parens(self): + toks = scan("(created:2020 OR foo)") + assert toks == [ + Passthrough("("), + FieldValue("created", "2020"), + Passthrough(" OR foo)"), + ] + + def test_quoted_value(self): + assert scan('correspondent:"A B"') == [FieldValue("correspondent", '"A B"')] + + def test_field_range(self): + assert scan("created:[2020 TO 2021]") == [ + FieldRange("created", "[", "2020", "2021", "]"), + ] + + @pytest.mark.parametrize( + ("query", "expected"), + [ + pytest.param( + "created:[2020 to]", + FieldRange("created", "[", "2020", "", "]"), + id="open_upper", + ), + pytest.param( + "created:[to 2020]", + FieldRange("created", "[", "", "2020", "]"), + id="open_lower", + ), + ], + ) + def test_open_range(self, query, expected): + assert scan(query) == [expected] + + def test_comma_inside_range_not_split(self): + # No depth-0 comma here; the whole thing is one range token. + toks = scan("created:[2020 TO 2021]") + assert len(toks) == 1 + + # --- Edge-case / regression tests (scan must never raise) --- + + def test_url_is_passthrough(self): + # "http" is not a known field; the whole URL must pass through verbatim. + assert scan("http://example.com") == [Passthrough("http://example.com")] + + def test_unterminated_quote_is_passthrough(self): + # title is a known field but the quoted value has no closing quote; + # _consume_value returns None so the whole string falls into passthrough. + assert scan('title:"abc') == [Passthrough('title:"abc')] + + def test_unterminated_bracket_is_passthrough(self): + # created is a known field but the range bracket is never closed; + # _consume_range returns None so the whole string falls into passthrough. + assert scan("created:[2020") == [Passthrough("created:[2020")] + + def test_empty_value_at_end_is_passthrough(self): + # created is a known field but there is no value after the colon + # (_consume_value returns None for start >= n), so passthrough. + assert scan("created:") == [Passthrough("created:")] + + def test_value_containing_colon(self): + # The bare-word value reader stops at whitespace/paren, not at colon, + # so "2020:30" is consumed as a single value token. + assert scan("created:2020:30") == [FieldValue("created", "2020:30")] + + def test_comma_followed_by_unconsumable_value_stops(self): + # A comma followed by whitespace is neither a value-list continuation nor a + # clause separator: the value stops and the comma stays as passthrough. + assert scan("tag:foo, bar") == [ + FieldValue("tag", "foo"), + Passthrough(", bar"), + ] + + def test_bracket_without_to_is_open_upper_bound(self): + # A bracketed value with no TO falls back to (value, "") -> open upper bound. + assert scan("created:[2020]") == [ + FieldRange("created", "[", "2020", "", "]"), + ] + + def test_known_field_name_midword_is_passthrough(self): + # A known field name embedded mid-word is not a field token (the + # word-boundary guard); the whole run stays passthrough. + assert scan("xtag:foo") == [Passthrough("xtag:foo")] + + +@pytest.mark.search +class TestCommaResolution: + def test_value_list_multi_value_field(self): + toks = resolve_commas(scan("tag:foo,bar")) + assert toks == [FieldValueList("tag", ("foo", "bar"))] + + def test_value_list_three(self): + toks = resolve_commas(scan("tag_id:1,2,3")) + assert toks == [FieldValueList("tag_id", ("1", "2", "3"))] + + def test_text_field_comma_is_literal(self): + # correspondent is not multi-value: comma stays inside the value. + toks = resolve_commas(scan("correspondent:foo,bar")) + assert toks == [FieldValue("correspondent", "foo,bar")] + + def test_clause_separator_before_known_field(self): + toks = resolve_commas(scan("tag:foo,type:bar")) + assert toks == [FieldValue("tag", "foo"), Comma(), FieldValue("type", "bar")] + + def test_clause_separator_after_range(self): + toks = resolve_commas(scan("created:[2020 TO 2021],added:[2022 TO 2023]")) + assert toks == [ + FieldRange("created", "[", "2020", "2021", "]"), + Comma(), + FieldRange("added", "[", "2022", "2023", "]"), + ] + + def test_clause_separator_after_quote(self): + toks = resolve_commas(scan('correspondent:"A B",created:[2020 TO 2021]')) + assert toks == [ + FieldValue("correspondent", '"A B"'), + Comma(), + FieldRange("created", "[", "2020", "2021", "]"), + ] + + def test_url_comma_is_literal_passthrough(self): + toks = resolve_commas(scan("http://example.com/a,b")) + assert toks == [Passthrough("http://example.com/a,b")] + + def test_non_multi_value_comma_is_literal(self): + # title is not in MULTI_VALUE_FIELDS: comma stays inside the value. + toks = resolve_commas(scan("title:10,20")) + assert toks == [FieldValue("title", "10,20")] + + def test_clause_separator_before_known_date_field(self): + # The comma between a bare value and a known date field acts as a + # clause separator; both sides survive as distinct tokens. + toks = resolve_commas(scan("correspondent:foo,created:[2020 TO 2021]")) + assert toks == [ + FieldValue("correspondent", "foo"), + Comma(), + FieldRange("created", "[", "2020", "2021", "]"), + ] + + +@pytest.mark.search +class TestTranslateScalar: + @pytest.mark.parametrize( + ("field", "value", "expected"), + [ + ( + "created", + "2020", + "created:[2020-01-01T00:00:00Z TO 2021-01-01T00:00:00Z]", + ), + ( + "created", + "202003", + "created:[2020-03-01T00:00:00Z TO 2020-04-01T00:00:00Z]", + ), + ( + "created", + "20200115", + "created:[2020-01-15T00:00:00Z TO 2020-01-16T00:00:00Z]", + ), + ( + "created", + "2020-01-15", + "created:[2020-01-15T00:00:00Z TO 2020-01-16T00:00:00Z]", + ), + ( + "created", + "2020-03", + "created:[2020-03-01T00:00:00Z TO 2020-04-01T00:00:00Z]", + ), + ], + ) + def test_partial_and_iso_dates(self, field: str, value: str, expected: str) -> None: + assert translate_scalar(field, value, UTC) == expected + + def test_invalid_date_raises(self) -> None: + with pytest.raises(InvalidDateQuery) as exc_info: + translate_scalar("created", "202023", UTC) + assert exc_info.value.field == "created" + assert exc_info.value.value == "202023" + + def test_keyword_delegates(self) -> None: + # keyword path produces a range; just assert it is a created range + out = translate_scalar("created", "today", UTC) + assert out.startswith("created:[") and out.endswith("]") + + def test_14digit_compact_datetime(self) -> None: + out = translate_scalar("created", "20240115120000", UTC) + assert "20240115120000" not in out + assert out.startswith("created:") + assert out == "created:[2024-01-15T12:00:00Z TO 2024-01-15T12:00:00Z]" + + def test_14digit_invalid_month_raises(self) -> None: + with pytest.raises(InvalidDateQuery) as exc_info: + translate_scalar("created", "20231300120000", UTC) + assert exc_info.value.field == "created" + assert exc_info.value.value == "20231300120000" + + def test_unrecognized_value_raises(self) -> None: + # A value that is not a keyword, digits, ISO date, or compact timestamp + # raises rather than producing invalid Tantivy syntax or silently matching + # nothing. + with pytest.raises(InvalidDateQuery) as exc_info: + translate_scalar("created", "garbage", UTC) + assert exc_info.value.field == "created" + assert exc_info.value.value == "garbage" + + +@pytest.mark.search +class TestTranslateRange: + @pytest.mark.parametrize( + ("lo", "hi", "expected"), + [ + ("2005", "2009", "created:[2005-01-01T00:00:00Z TO 2010-01-01T00:00:00Z]"), + ( + "202001", + "202006", + "created:[2020-01-01T00:00:00Z TO 2020-07-01T00:00:00Z]", + ), + ( + "20200101", + "20201231", + "created:[2020-01-01T00:00:00Z TO 2021-01-01T00:00:00Z]", + ), + ( + "2020-01-01", + "2020-12-31", + "created:[2020-01-01T00:00:00Z TO 2021-01-01T00:00:00Z]", + ), + ], + ) + def test_absolute_ranges(self, lo, hi, expected): + assert translate_range("created", lo, hi, UTC) == expected + + def test_reversed_swaps(self): + assert translate_range("created", "2009", "2005", UTC) == ( + "created:[2005-01-01T00:00:00Z TO 2010-01-01T00:00:00Z]" + ) + + def test_open_upper(self): + out = translate_range("created", "2020", "", UTC) + assert out == f"created:[2020-01-01T00:00:00Z TO {OPEN_HI}]" + + def test_open_lower(self): + out = translate_range("created", "", "2020", UTC) + assert out == f"created:[{OPEN_LO} TO 2021-01-01T00:00:00Z]" + + def test_invalid_bound_raises(self): + with pytest.raises(InvalidDateQuery) as exc_info: + translate_range("created", "202023", "2025", UTC) + assert exc_info.value.field == "created" + assert exc_info.value.value == "202023" + + def test_invalid_high_bound_raises(self): + # Low bound parses, high bound does not -> raise on the high bound. + with pytest.raises(InvalidDateQuery) as exc_info: + translate_range("created", "2020", "garbage", UTC) + assert exc_info.value.field == "created" + assert exc_info.value.value == "garbage" + + +@pytest.mark.search +class TestTranslateQuery: + @pytest.mark.parametrize( + ("raw", "expected"), + [ + ( + "created:2020", + "created:[2020-01-01T00:00:00Z TO 2021-01-01T00:00:00Z]", + ), + ("tag:foo,bar", "tag:foo AND tag:bar"), + # 'type' is a user-facing alias rewritten to 'document_type' (the real schema field) + ("tag:foo,type:bar", "tag:foo AND document_type:bar"), + ( + "created:[2020 TO 2021],added:[2022 TO 2023]", + "created:[2020-01-01T00:00:00Z TO 2022-01-01T00:00:00Z]" + " AND " + "added:[2022-01-01T00:00:00Z TO 2024-01-01T00:00:00Z]", + ), + # correspondent is not multi-value: comma stays literal inside the value + ("correspondent:foo,bar", "correspondent:foo,bar"), + ], + ) + def test_golden(self, raw: str, expected: str) -> None: + assert translate_query(raw, UTC) == expected + + @pytest.mark.parametrize( + "raw", + [ + "created:2020", + "created:202003", + "created:[20200101 TO 20201231]", + "created:[2020-01-01 TO 2020-12-31]", + "created:[2020 to]", + "created:[to 2020]", + "title:x,created:[2020 TO 2021]", + "created:2020 OR foo", + "(created:2020 OR invoice)", + "tag:foo,type:bar", + "bank statement", + ], + ) + def test_parse_acceptance(self, index: tantivy.Index, raw: str) -> None: + translated = translate_query(raw, UTC) + # Must not raise: + index.parse_query(translated, DEFAULT_SEARCH_FIELDS, field_boosts=_FIELD_BOOSTS) + + +@pytest.mark.search +class TestFieldAliasing: + """Whoosh->Tantivy field-name aliasing (type/path -> document_type/storage_path).""" + + def test_type_alias(self) -> None: + assert translate_query("type:invoice", UTC) == "document_type:invoice" + + def test_path_alias(self) -> None: + assert translate_query("path:/foo/bar", UTC) == "storage_path:/foo/bar" + + def test_type_id_alias(self) -> None: + assert translate_query("type_id:5", UTC) == "document_type_id:5" + + def test_path_id_alias(self) -> None: + assert translate_query("path_id:7", UTC) == "storage_path_id:7" + + def test_clause_separator_plus_alias(self) -> None: + # Comma between known fields acts as AND separator; alias still applied. + assert ( + translate_query("tag:foo,type:bar", UTC) == "tag:foo AND document_type:bar" + ) + + def test_type_range_alias(self) -> None: + # type is not a date field; range passes through verbatim with alias applied. + assert ( + translate_query("type:[2020 TO 2021]", UTC) + == "document_type:[2020 TO 2021]" + ) + + def test_parse_acceptance_type(self, index: tantivy.Index) -> None: + # Translated output must be accepted by the real Tantivy parser. + translated = translate_query("type:invoice", UTC) + index.parse_query(translated, DEFAULT_SEARCH_FIELDS, field_boosts=_FIELD_BOOSTS) + + def test_parse_acceptance_path(self, index: tantivy.Index) -> None: + translated = translate_query("path:foo", UTC) + index.parse_query(translated, DEFAULT_SEARCH_FIELDS, field_boosts=_FIELD_BOOSTS) + + +# Freeze time so relative-date tests are deterministic. +_FROZEN_NOW = datetime(2026, 3, 28, 12, 0, 0, tzinfo=UTC) + + +@pytest.mark.search +class TestRelativeRanges: + """Relative date-range tokens resolved against a frozen clock.""" + + @time_machine.travel(_FROZEN_NOW, tick=False) + def test_minus_7_days_to_now(self) -> None: + assert translate_query("added:[-7 days to now]", UTC) == ( + "added:[2026-03-21T12:00:00Z TO 2026-03-28T12:00:00Z]" + ) + + @time_machine.travel(_FROZEN_NOW, tick=False) + def test_minus_1_week_to_now(self) -> None: + assert translate_query("added:[-1 week to now]", UTC) == ( + "added:[2026-03-21T12:00:00Z TO 2026-03-28T12:00:00Z]" + ) + + @time_machine.travel(_FROZEN_NOW, tick=False) + def test_minus_1_month_to_now(self) -> None: + assert translate_query("created:[-1 month to now]", UTC) == ( + "created:[2026-02-28T12:00:00Z TO 2026-03-28T12:00:00Z]" + ) + + @time_machine.travel(_FROZEN_NOW, tick=False) + def test_minus_1_year_to_now(self) -> None: + assert translate_query("modified:[-1 year to now]", UTC) == ( + "modified:[2025-03-28T12:00:00Z TO 2026-03-28T12:00:00Z]" + ) + + @time_machine.travel(_FROZEN_NOW, tick=False) + def test_minus_3_hours_to_now(self) -> None: + assert translate_query("added:[-3 hours to now]", UTC) == ( + "added:[2026-03-28T09:00:00Z TO 2026-03-28T12:00:00Z]" + ) + + @time_machine.travel(_FROZEN_NOW, tick=False) + def test_uppercase_units(self) -> None: + assert translate_query("added:[-1 WEEK TO NOW]", UTC) == ( + "added:[2026-03-21T12:00:00Z TO 2026-03-28T12:00:00Z]" + ) + + @time_machine.travel(_FROZEN_NOW, tick=False) + def test_now_minus_7d_compact(self) -> None: + assert translate_query("added:[now-7d TO now]", UTC) == ( + "added:[2026-03-21T12:00:00Z TO 2026-03-28T12:00:00Z]" + ) + + @time_machine.travel(_FROZEN_NOW, tick=False) + def test_reversed_range_swapped(self) -> None: + # now+1h TO now-1h is reversed; translate_range swaps -> lo=now-1h, hi=now+1h + assert translate_query("added:[now+1h TO now-1h]", UTC) == ( + "added:[2026-03-28T11:00:00Z TO 2026-03-28T13:00:00Z]" + ) + + @pytest.mark.parametrize( + "raw", + [ + "added:[-7 days to now]", + "added:[-1 week to now]", + "created:[-1 month to now]", + "modified:[-1 year to now]", + "added:[-3 hours to now]", + "added:[now-7d TO now]", + "added:[now+1h TO now-1h]", + ], + ) + @time_machine.travel(_FROZEN_NOW, tick=False) + def test_parse_acceptance(self, index: tantivy.Index, raw: str) -> None: + translated = translate_query(raw, UTC) + index.parse_query(translated, DEFAULT_SEARCH_FIELDS, field_boosts=_FIELD_BOOSTS) + + +@pytest.mark.search +class TestOperatorNormalization: + """Post-render operator normalization in translate_query.""" + + def test_spaced_dash_removed(self) -> None: + assert ( + translate_query("H52.1 - Kurzsichtigkeit", UTC) == "H52.1 Kurzsichtigkeit" + ) + + def test_spaced_dash_simple(self) -> None: + assert translate_query("bar - baz", UTC) == "bar baz" + + def test_trailing_operator_stripped(self) -> None: + assert translate_query("foo -", UTC) == "foo" + + def test_date_range_preserved(self) -> None: + out = translate_query("created:[2020 TO 2021]", UTC) + # Must not corrupt the ISO range + assert out == "created:[2020-01-01T00:00:00Z TO 2022-01-01T00:00:00Z]" + + def test_date_scalar_with_or(self) -> None: + out = translate_query("created:2020 OR foo", UTC) + # The created scalar becomes a range; " OR foo" passes through verbatim. + assert out.startswith("created:[") + assert "OR foo" in out + + def test_parse_acceptance_spaced_dash(self, index: tantivy.Index) -> None: + translated = translate_query("H52.1 - Kurzsichtigkeit", UTC) + index.parse_query(translated, DEFAULT_SEARCH_FIELDS, field_boosts=_FIELD_BOOSTS) + + def test_parse_acceptance_trailing_op(self, index: tantivy.Index) -> None: + translated = translate_query("foo -", UTC) + index.parse_query(translated, DEFAULT_SEARCH_FIELDS, field_boosts=_FIELD_BOOSTS) + + +@pytest.mark.search +class TestMultiWordDateKeywords: + """scan() must consume multi-word date keywords as a single value.""" + + def test_scan_previous_week_as_single_token(self) -> None: + # "created:previous week" must produce one FieldValue with value "previous week", + # not FieldValue("created","previous") + Passthrough(" week"). + toks = scan("created:previous week") + assert toks == [FieldValue("created", "previous week")] + + def test_scan_this_month_as_single_token(self) -> None: + toks = scan("added:this month") + assert toks == [FieldValue("added", "this month")] + + def test_scan_previous_month_as_single_token(self) -> None: + toks = scan("created:previous month") + assert toks == [FieldValue("created", "previous month")] + + def test_scan_this_year_as_single_token(self) -> None: + toks = scan("added:this year") + assert toks == [FieldValue("added", "this year")] + + def test_scan_previous_year_as_single_token(self) -> None: + toks = scan("created:previous year") + assert toks == [FieldValue("created", "previous year")] + + def test_scan_previous_quarter_as_single_token(self) -> None: + toks = scan("created:previous quarter") + assert toks == [FieldValue("created", "previous quarter")] + + def test_quoted_multi_word_keyword_still_works(self) -> None: + # The quoted form must continue to work as before. + toks = scan('created:"previous week"') + assert toks == [FieldValue("created", '"previous week"')] + + def test_non_date_field_not_affected(self) -> None: + # "previous" stops at the space for non-date fields; " week" passes through. + toks = scan("correspondent:previous week") + assert toks == [ + FieldValue("correspondent", "previous"), + Passthrough(" week"), + ] + + +@pytest.mark.search +class TestKeywordDateResolution: + """Relative date keywords resolve to exact ISO ranges against a frozen clock. + + Frozen at 2026-03-28 12:00 UTC (a Saturday in Q1) so the week, month, + quarter and year rollovers are all exercised by a single anchor. + """ + + # created is a DateField: bounds are UTC midnight, no timezone offset. + @pytest.mark.parametrize( + ("keyword", "expected"), + [ + pytest.param( + "today", + "created:[2026-03-28T00:00:00Z TO 2026-03-29T00:00:00Z]", + id="today", + ), + pytest.param( + "yesterday", + "created:[2026-03-27T00:00:00Z TO 2026-03-28T00:00:00Z]", + id="yesterday", + ), + pytest.param( + "previous week", + "created:[2026-03-16T00:00:00Z TO 2026-03-23T00:00:00Z]", + id="previous-week", + ), + pytest.param( + "this month", + "created:[2026-03-01T00:00:00Z TO 2026-04-01T00:00:00Z]", + id="this-month", + ), + pytest.param( + "previous month", + "created:[2026-02-01T00:00:00Z TO 2026-03-01T00:00:00Z]", + id="previous-month", + ), + pytest.param( + "this year", + "created:[2026-01-01T00:00:00Z TO 2027-01-01T00:00:00Z]", + id="this-year", + ), + pytest.param( + "previous year", + "created:[2025-01-01T00:00:00Z TO 2026-01-01T00:00:00Z]", + id="previous-year", + ), + pytest.param( + "previous quarter", + "created:[2025-10-01T00:00:00Z TO 2026-01-01T00:00:00Z]", + id="previous-quarter", + ), + ], + ) + @time_machine.travel(_FROZEN_NOW, tick=False) + def test_date_only_field_keyword_ranges( + self, + keyword: str, + expected: str, + ) -> None: + assert translate_query(f"created:{keyword}", UTC) == expected + + # added is a DateTimeField: local-tz midnight converted to UTC. Tokyo + # (+09:00, no DST) shifts each midnight boundary back to 15:00Z the day + # before, so this also exercises the local-midnight offset path. + @pytest.mark.parametrize( + ("keyword", "expected"), + [ + pytest.param( + "today", + "added:[2026-03-27T15:00:00Z TO 2026-03-28T15:00:00Z]", + id="today", + ), + pytest.param( + "yesterday", + "added:[2026-03-26T15:00:00Z TO 2026-03-27T15:00:00Z]", + id="yesterday", + ), + pytest.param( + "previous week", + "added:[2026-03-15T15:00:00Z TO 2026-03-22T15:00:00Z]", + id="previous-week", + ), + pytest.param( + "this month", + "added:[2026-02-28T15:00:00Z TO 2026-03-31T15:00:00Z]", + id="this-month", + ), + pytest.param( + "previous month", + "added:[2026-01-31T15:00:00Z TO 2026-02-28T15:00:00Z]", + id="previous-month", + ), + pytest.param( + "this year", + "added:[2025-12-31T15:00:00Z TO 2026-12-31T15:00:00Z]", + id="this-year", + ), + pytest.param( + "previous year", + "added:[2024-12-31T15:00:00Z TO 2025-12-31T15:00:00Z]", + id="previous-year", + ), + pytest.param( + "previous quarter", + "added:[2025-09-30T15:00:00Z TO 2025-12-31T15:00:00Z]", + id="previous-quarter", + ), + ], + ) + @time_machine.travel(_FROZEN_NOW, tick=False) + def test_datetime_field_keyword_ranges_local_tz( + self, + keyword: str, + expected: str, + ) -> None: + assert translate_query(f"added:{keyword}", ZoneInfo("Asia/Tokyo")) == expected + + +@pytest.mark.search +class TestISODatetimeBounds: + """Full ISO datetime tokens in range bounds must be parsed directly.""" + + def test_translate_range_iso_bounds_passthrough(self) -> None: + # Already-ISO datetime bounds must pass through as-is (exact instant). + result = translate_range( + "created", + "2020-01-01T00:00:00Z", + "2021-01-01T00:00:00Z", + UTC, + ) + assert result == "created:[2020-01-01T00:00:00Z TO 2021-01-01T00:00:00Z]" + + def test_translate_query_iso_range_preserved(self) -> None: + q = "created:[2026-01-01T00:00:00Z TO 2026-06-01T00:00:00Z]" + assert translate_query(q, UTC) == q + + def test_translate_query_comma_separated_iso_ranges(self) -> None: + q = ( + "created:[2026-01-01T00:00:00Z TO 2026-06-01T00:00:00Z]," + "added:[2026-05-01T00:00:00Z TO 2026-06-01T00:00:00Z]" + ) + result = translate_query(q, UTC) + assert result == ( + "created:[2026-01-01T00:00:00Z TO 2026-06-01T00:00:00Z]" + " AND " + "added:[2026-05-01T00:00:00Z TO 2026-06-01T00:00:00Z]" + ) + + def test_invalid_iso_datetime_raises(self) -> None: + # A token with "T" that is not valid ISO datetime -> raise. + with pytest.raises(InvalidDateQuery) as exc_info: + translate_range( + "created", + "2020-01-01T99:00:00Z", + "2021-01-01T00:00:00Z", + UTC, + ) + assert exc_info.value.field == "created" + assert exc_info.value.value == "2020-01-01T99:00:00Z" + + def test_parse_acceptance_iso_bounds(self, index: tantivy.Index) -> None: + q = "created:[2026-01-01T00:00:00Z TO 2026-06-01T00:00:00Z]" + translated = translate_query(q, UTC) + index.parse_query(translated, DEFAULT_SEARCH_FIELDS, field_boosts=_FIELD_BOOSTS) + + def test_parse_acceptance_comma_iso_ranges(self, index: tantivy.Index) -> None: + q = ( + "created:[2026-01-01T00:00:00Z TO 2026-06-01T00:00:00Z]," + "added:[2026-05-01T00:00:00Z TO 2026-06-01T00:00:00Z]" + ) + translated = translate_query(q, UTC) + index.parse_query(translated, DEFAULT_SEARCH_FIELDS, field_boosts=_FIELD_BOOSTS) diff --git a/src/documents/tests/test_api_search.py b/src/documents/tests/test_api_search.py index ad82e1a8a..c25e83e13 100644 --- a/src/documents/tests/test_api_search.py +++ b/src/documents/tests/test_api_search.py @@ -725,9 +725,11 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase): GIVEN: - One document added right now WHEN: - - Query with invalid added date + - Query with an invalid added date THEN: - - 400 Bad Request returned (Tantivy rejects invalid date field syntax) + - 400 Bad Request with a message naming the malformed date, so the + user knows their date is invalid rather than silently getting zero + results """ d1 = Document.objects.create( title="invoice", @@ -740,8 +742,9 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase): response = self.client.get("/api/documents/?query=added:invalid-date") - # Tantivy rejects unparsable field queries with a 400 + # An unparsable date is reported as a malformed query, not silently empty. self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) + self.assertIn("invalid-date", str(response.data["query"])) @override_settings( TIME_ZONE="UTC", diff --git a/src/documents/views.py b/src/documents/views.py index 8979113f7..1bc459211 100644 --- a/src/documents/views.py +++ b/src/documents/views.py @@ -2277,6 +2277,7 @@ class UnifiedSearchViewSet(DocumentViewSet): return super().list(request) from documents.search import SearchHit + from documents.search import SearchQueryError from documents.search import TantivyBackend from documents.search import TantivyRelevanceList from documents.search import get_backend @@ -2469,6 +2470,11 @@ class UnifiedSearchViewSet(DocumentViewSet): return HttpResponseForbidden(_("Insufficient permissions.")) except ValidationError: raise + except SearchQueryError as e: + # User-fixable query error (e.g. an unparsable date): surface the + # specific message so the user can correct it, rather than a generic + # 400 or silently empty results. + raise ValidationError({"query": [str(e)]}) from e except Exception as e: logger.warning(f"An error occurred listing search results: {e!s}") return HttpResponseBadRequest(