from __future__ import annotations from dataclasses import dataclass from datetime import UTC from datetime import datetime from datetime import timedelta from typing import TYPE_CHECKING from typing import TypeAlias import regex from dateutil.relativedelta import relativedelta from documents.search._dates import _DATE_KEYWORDS from documents.search._dates import _DATE_ONLY_FIELDS from documents.search._dates import _date_only_range from documents.search._dates import _datetime_range from documents.search._dates import _field_range_from_dates from documents.search._dates import _fmt from documents.search._dates import _precision_bounds from documents.search._dates import _utc_bounds_for_field # Compiled regex that matches any known multi-word (or single-word) date keyword # at the start of a match position, longest alternatives first so "previous week" # wins over a hypothetical shorter "previous". _KEYWORD_VALUE_RE = regex.compile( "|".join(sorted((regex.escape(k) for k in _DATE_KEYWORDS), key=len, reverse=True)), regex.IGNORECASE, ) if TYPE_CHECKING: from datetime import tzinfo # TODO: this module translates date queries into Tantivy *string* syntax, which # forces a workaround for something Tantivy's string parser cannot express on # date fields: open-ended ranges use far-past/far-future string sentinels # (OPEN_LO/OPEN_HI). These can be replaced with a real tantivy.Query object # (Query.range_query(..., None) for open bounds) once tantivy-py accepts Python # datetimes in range_query/term_query on Date fields. That support exists on # tantivy-py master (PRs #655 + #666) but postdates the pinned 0.26.0 wheel, so # it is blocked only on a published release > 0.26.0 and a dependency bump. # (Unparsable dates now raise InvalidDateQuery -> HTTP 400 rather than using a # no-match string sentinel.) # Fields that store exact, non-analyzed comma-joined tokens in the index and so # need explicit comma->AND expansion (Whoosh KEYWORD(commas=True) set). MULTI_VALUE_FIELDS = frozenset({"tag", "tag_id", "viewer_id"}) # Date fields whose values/ranges get rewritten to RFC3339 Tantivy ranges. DATE_FIELDS = frozenset({"created", "modified", "added"}) # Field aliases: Whoosh (v2) field names that were renamed in the Tantivy schema. # Preserved here so v2 queries using the old names continue to work without 400 # errors instead of silently failing. Applied by _render to non-date field tokens. FIELD_ALIASES: dict[str, str] = { "type": "document_type", "type_id": "document_type_id", "path": "storage_path", "path_id": "storage_path_id", } # Known schema fields: a comma immediately followed by ``:`` is a clause # separator. Restricting to known fields prevents URL-like ``http:`` misfires. KNOWN_FIELDS = frozenset( { "title", "content", "correspondent", "document_type", "type", # v2 alias -> document_type "storage_path", "path", # v2 alias -> storage_path "tag", "tag_id", "correspondent_id", "document_type_id", "type_id", # v2 alias -> document_type_id "storage_path_id", "path_id", # v2 alias -> storage_path_id "owner_id", "viewer_id", "asn", "page_count", "num_notes", "created", "modified", "added", "original_filename", "checksum", "notes", "custom_fields", }, ) _FIELD_RE = regex.compile(r"(?P\w+):") # Matches the TO separator inside a range bracket. Handles three forms: # middle: "lo TO hi" (either lo or hi may be empty) # trailing: "lo TO" (open upper bound) # leading: "TO hi" (open lower bound) # Bounds MAY contain internal spaces (e.g. "-7 days"), so we use .*? / .+? # and split on the whitespace-delimited " TO " / " to " separator. _RANGE_RE = regex.compile( r"^\s*(?P.*?)\s+[Tt][Oo]\s+(?P.+?)\s*$" r"|" r"^\s*(?P.+?)\s+[Tt][Oo]\s*$" r"|" r"^\s*[Tt][Oo]\s+(?P.+?)\s*$", ) @dataclass(frozen=True, slots=True) class FieldValue: field: str value: str # Produced by the comma-resolution pass (not by scan()). @dataclass(frozen=True, slots=True) class FieldValueList: field: str values: tuple[str, ...] @dataclass(frozen=True, slots=True) class FieldRange: field: str open: str lo: str hi: str close: str # Produced by the comma-resolution pass (not by scan()). @dataclass(frozen=True, slots=True) class Comma: pass @dataclass(frozen=True, slots=True) class Passthrough: raw: str Token: TypeAlias = FieldValue | FieldValueList | FieldRange | Comma | Passthrough _CLOSE: dict[str, str] = {"[": "]", "{": "}"} def scan(query: str) -> list[Token]: """ Tokenize a raw query into date/comma-aware tokens, leaving everything else as verbatim ``Passthrough`` runs. Non-recursive: finds the first matching close bracket/quote. Nested brackets are not valid Tantivy range syntax and pass through verbatim on mismatch. """ tokens: list[Token] = [] buf: list[str] = [] # accumulates passthrough chars i, n = 0, len(query) while i < n: matched = _match_field_token(query, i) if matched is None: buf.append(query[i]) i += 1 continue token, i = matched _flush(buf, tokens) tokens.append(token) i = _maybe_comma(query, i, tokens) _flush(buf, tokens) return tokens def _flush(buf: list[str], tokens: list[Token]) -> None: """Emit any accumulated passthrough characters as a single token.""" if buf: tokens.append(Passthrough("".join(buf))) buf.clear() def _at_word_boundary(query: str, i: int) -> bool: """A field token may begin only at the start or after a non-word character.""" return i == 0 or not (query[i - 1].isalnum() or query[i - 1] == "_") def _match_field_token(query: str, i: int) -> tuple[Token, int] | None: """ If a known ``field:`` token starts at ``i``, consume it and return ``(token, end_index)``; otherwise return None so the caller treats the character as passthrough. Handles both ``field:[range]`` and ``field:value``, and returns None when the range/value cannot be consumed. """ m = _FIELD_RE.match(query, i) if m is None or m.group("field") not in KNOWN_FIELDS: return None if not _at_word_boundary(query, i): return None field = m.group("field") j = m.end() if j < len(query) and query[j] in "[{": return _consume_range(query, j, field) consumed = _consume_field_value(query, field, j) if consumed is None: return None value, end = consumed return FieldValue(field, value), end def _consume_field_value(query: str, field: str, start: int) -> tuple[str, int] | None: """ Consume a field value starting at ``start``: a multi-word date keyword phrase (date fields only), or a bare/quoted value, then absorb any comma-joined continuation that is not a clause separator. ``resolve_commas`` later splits a multi-value field's joined value into a ``FieldValueList``; for other fields the comma stays literal. """ n = len(query) consumed = None if field in DATE_FIELDS: km = _KEYWORD_VALUE_RE.match(query, start) if km is not None and (km.end() >= n or query[km.end()] in " \t),"): consumed = (km.group(0), km.end()) if consumed is None: consumed = _consume_value(query, start) if consumed is None: return None value, k = consumed while k < n and query[k] == ",": if _looks_like_known_field(query, k + 1): break # clause separator: left for _maybe_comma to emit a Comma() more = _consume_value(query, k + 1) if more is None: break value = f"{value},{more[0]}" k = more[1] return value, k def _consume_range( query: str, start: int, field: str, ) -> tuple[FieldRange, int] | None: """Consume ``[lo TO hi]`` / ``{lo TO hi}`` from ``start`` (the bracket).""" open_br = query[start] close_br = _CLOSE[open_br] end = query.find(close_br, start + 1) if end == -1: return None inner = query[start + 1 : end] m = _RANGE_RE.match(inner) if m is not None: if m.group("lo") is not None or m.group("hi") is not None: # Middle form: "lo TO hi" (either may be empty string) lo = (m.group("lo") or "").strip() hi = (m.group("hi") or "").strip() elif m.group("lo2") is not None: # Trailing form: "lo TO" lo = m.group("lo2").strip() hi = "" else: # Leading form: "TO hi" lo = "" hi = (m.group("hi2") or "").strip() else: lo, hi = inner.strip(), "" return FieldRange(field, open_br, lo, hi, close_br), end + 1 def _consume_value(query: str, start: int) -> tuple[str, int] | None: """Consume a bare or quoted field value from ``start``, stopping at comma.""" n = len(query) if start >= n or query[start] in " \t": return None if query[start] in "\"'": quote = query[start] end = query.find(quote, start + 1) if end == -1: return None return query[start : end + 1], end + 1 j = start while j < n and query[j] not in " \t),": j += 1 return query[start:j], j def _looks_like_known_field(query: str, pos: int) -> bool: """True if a known ``field:`` token starts at ``pos``.""" m = _FIELD_RE.match(query, pos) return bool(m and m.group("field") in KNOWN_FIELDS) def _maybe_comma(query: str, i: int, tokens: list) -> int: """If a clause-separator comma follows at ``i``, emit ``Comma()`` and advance.""" if i < len(query) and query[i] == "," and _looks_like_known_field(query, i + 1): tokens.append(Comma()) return i + 1 return i def resolve_commas(tokens: list) -> list: """ Collapse value-list commas into ``FieldValueList`` and keep clause-separator commas as ``Comma``. (Clause-sep commas are already emitted by ``scan`` via the value-stop logic; this pass folds value-lists.) """ out: list = [] for tok in tokens: if ( isinstance(tok, FieldValue) and tok.field in MULTI_VALUE_FIELDS and "," in tok.value ): values = tuple(v for v in tok.value.split(",") if v) out.append(FieldValueList(tok.field, values)) else: out.append(tok) return out class SearchQueryError(ValueError): """ Base for user-fixable search query errors. Carries a message safe to surface to the user (no internal details). The view layer catches this and returns an HTTP 400, so any future subclass (unknown field, malformed range, wrapped parser errors) gets the same treatment. """ class InvalidDateQuery(SearchQueryError): """Raised when a date field value or range bound cannot be parsed.""" def __init__(self, field: str, value: str) -> None: self.field = field self.value = value super().__init__(f"Invalid date value {value!r} for field {field!r}.") _DIGITS_RE = regex.compile(r"^\d{4}(?:\d{2}){0,2}$") _ISO_RE = regex.compile(r"^\d{4}(?:-\d{2}(?:-\d{2})?)?$") def translate_scalar(field: str, value: str, tz: tzinfo) -> str: """Translate a bare date-field value to a Tantivy range string.""" bare = value.strip("\"'").lower() if bare in _DATE_KEYWORDS: if field in _DATE_ONLY_FIELDS: return f"{field}:{_date_only_range(bare, tz)}" return f"{field}:{_datetime_range(bare, tz)}" digits = value.replace("-", "") if _DIGITS_RE.match(value) or _ISO_RE.match(value): bounds = _precision_bounds(digits) if bounds is None: raise InvalidDateQuery(field, value) return _field_range_from_dates(field, bounds[0], bounds[1], tz) if regex.fullmatch(r"\d{14}", value): try: dt = datetime( int(value[0:4]), int(value[4:6]), int(value[6:8]), int(value[8:10]), int(value[10:12]), int(value[12:14]), tzinfo=UTC, ) except ValueError: raise InvalidDateQuery(field, value) from None iso = _fmt(dt) return f"{field}:[{iso} TO {iso}]" # Unrecognized shape -> tell the user their date is malformed rather than # silently matching nothing or emitting invalid Tantivy syntax. raise InvalidDateQuery(field, value) # Open-bound sentinels for date ranges. These far-past/far-future strings allow # open-ended ranges to be expressed as Tantivy string queries until tantivy-py # exposes Query.range_query(..., None) on Date fields (see module TODO). OPEN_LO = "0001-01-01T00:00:00Z" OPEN_HI = "9999-12-31T23:59:59Z" # Matches compact now-offset tokens like now-7d, now+1h, now-30m. _NOW_COMPACT_RE = regex.compile( r"^now(?P[+-])(?P\d+)(?P[dhm])$", regex.IGNORECASE, ) # Matches "±N " Whoosh-style offsets (e.g. -7 days, -1 week, +3 hours) # Unit is singular or plural; sign prefix is mandatory. _NOW_SPACED_RE = regex.compile( r"^(?P[+-])(?P\d+)\s*" r"(?Psecond|minute|hour|day|week|month|year)s?$", regex.IGNORECASE, ) def _resolve_relative_bound(token: str) -> datetime | None: """ Resolve a relative bound token to an exact UTC instant, or return None. Supported forms: - ``now`` -> current UTC instant - ``now+/-d/h/m`` -> now +/- timedelta (d=days, h=hours, m=minutes) - ``±N `` -> now +/- delta; month/year use relativedelta """ stripped = token.strip() low = stripped.lower() now = datetime.now(UTC) if low == "now": return now m = _NOW_COMPACT_RE.match(stripped) if m: sign = 1 if m.group("sign") == "+" else -1 n = int(m.group("n")) unit = m.group("unit").lower() delta = ( sign * { "d": timedelta(days=n), "h": timedelta(hours=n), "m": timedelta(minutes=n), }[unit] ) return now + delta m = _NOW_SPACED_RE.match(stripped) if m: sign = 1 if m.group("sign") == "+" else -1 n = int(m.group("n")) unit = m.group("unit").lower() delta_map: dict[str, timedelta | relativedelta] = { "second": timedelta(seconds=n), "minute": timedelta(minutes=n), "hour": timedelta(hours=n), "day": timedelta(days=n), "week": timedelta(weeks=n), "month": relativedelta(months=n), "year": relativedelta(years=n), } return now - delta_map[unit] if sign == -1 else now + delta_map[unit] return None def _bound_datetimes( field: str, token: str, tz: tzinfo, ) -> tuple[datetime, datetime] | None: """ Return (floor_dt, ceil_dt) UTC datetimes for a single range bound token, or None if the token is unparsable. ``now`` and relative offsets resolve to the current instant (floor == ceil == that instant; no day-flooring). """ token = token.strip() # Try relative/now forms first (before stripping hyphens which would mangle them). rel = _resolve_relative_bound(token) if rel is not None: return rel, rel # Full ISO datetime token (contains "T"): parse directly and return an exact # instant (floor == ceil). Python 3.11+ datetime.fromisoformat accepts trailing Z. if "T" in token: try: dt = datetime.fromisoformat(token) # Ensure timezone-aware UTC result. dt = dt.replace(tzinfo=UTC) if dt.tzinfo is None else dt.astimezone(UTC) return dt, dt except ValueError: return None digits = token.replace("-", "") bounds = _precision_bounds(digits) if bounds is None: return None start, end = bounds return _utc_bounds_for_field(field, start, end, tz) def _render(tok: Token, tz: tzinfo) -> str: """Render a single token back to a Tantivy query string fragment.""" if isinstance(tok, Passthrough): return tok.raw if isinstance(tok, Comma): return " AND " if isinstance(tok, FieldValueList): field = FIELD_ALIASES.get(tok.field, tok.field) return " AND ".join(f"{field}:{v}" for v in tok.values) if isinstance(tok, FieldValue): field = FIELD_ALIASES.get(tok.field, tok.field) if field in DATE_FIELDS: return translate_scalar(field, tok.value, tz) return f"{field}:{tok.value}" if isinstance(tok, FieldRange): field = FIELD_ALIASES.get(tok.field, tok.field) if field in DATE_FIELDS: return translate_range(field, tok.lo, tok.hi, tz) return f"{field}:{tok.open}{tok.lo} TO {tok.hi}{tok.close}" return "" # pragma: no cover # Post-render operator normalization patterns: collapse repeated whitespace and # strip spaced/trailing Tantivy boolean operators that would otherwise be invalid. _MULTI_SPACE_RE = regex.compile(r" {2,}") _TRAILING_OP_RE = regex.compile(r"\s+[-+]+\s*$") _SPACED_OP_RE = regex.compile(r"\s+[-+]\s+") def _normalize_operators(text: str) -> str: """ Collapse multiple spaces, strip trailing dangling operators, and replace spaced operators (`` - `` / `` + ``) with a single space. Applied only to Passthrough fragments (the rendered output is scanned for operator artifacts outside bracketed ranges) via a post-render pass on the full rendered string. This preserves date ranges (``[... TO ...]``) verbatim while cleaning natural-language separators in the surrounding text. """ text = _MULTI_SPACE_RE.sub(" ", text) text = _TRAILING_OP_RE.sub("", text).strip() text = _SPACED_OP_RE.sub(" ", text).strip() return text def translate_query(raw: str, tz: tzinfo) -> str: """Translate a raw Whoosh-style query into Tantivy-compatible syntax.""" tokens = resolve_commas(scan(raw)) rendered = "".join(_render(t, tz) for t in tokens) return _normalize_operators(rendered) def translate_range(field: str, lo: str, hi: str, tz: tzinfo) -> str: """Translate a date-field ``[lo TO hi]`` range to a Tantivy ISO range string. Handles partial-date bounds (YYYY, YYYYMM, YYYYMMDD, ISO dash variants), open bounds (empty string -> OPEN_LO/OPEN_HI), ``now``, and reversed ranges (swaps tokens before computing floor/ceil so the span is always correct). """ lo_s = lo.strip() hi_s = hi.strip() # Parse both bounds to (floor, ceil) pairs when present. lo_pair: tuple[datetime, datetime] | None = None hi_pair: tuple[datetime, datetime] | None = None if lo_s: lo_pair = _bound_datetimes(field, lo_s, tz) if lo_pair is None: raise InvalidDateQuery(field, lo_s) if hi_s: hi_pair = _bound_datetimes(field, hi_s, tz) if hi_pair is None: raise InvalidDateQuery(field, hi_s) # Detect a reversed range: only swap when BOTH bounds are present. if lo_pair is not None and hi_pair is not None and lo_pair[0] > hi_pair[0]: lo_pair, hi_pair = hi_pair, lo_pair lo_iso = _fmt(lo_pair[0]) if lo_pair is not None else OPEN_LO hi_iso = _fmt(hi_pair[1]) if hi_pair is not None else OPEN_HI return f"{field}:[{lo_iso} TO {hi_iso}]"