mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-06-17 19:04:18 +00:00
Fix (beta): restore v2 (Whoosh) advanced-search query compatibility (#13010)
Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -8,11 +8,15 @@ from documents.search._backend import get_backend
|
||||
from documents.search._backend import reset_backend
|
||||
from documents.search._schema import needs_rebuild
|
||||
from documents.search._schema import wipe_index
|
||||
from documents.search._translate import InvalidDateQuery
|
||||
from documents.search._translate import SearchQueryError
|
||||
|
||||
__all__ = [
|
||||
"InvalidDateQuery",
|
||||
"SearchHit",
|
||||
"SearchIndexLockError",
|
||||
"SearchMode",
|
||||
"SearchQueryError",
|
||||
"TantivyBackend",
|
||||
"TantivyRelevanceList",
|
||||
"WriteBatch",
|
||||
|
||||
@@ -0,0 +1,163 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import UTC
|
||||
from datetime import date
|
||||
from datetime import datetime
|
||||
from datetime import timedelta
|
||||
from typing import TYPE_CHECKING
|
||||
from typing import Final
|
||||
|
||||
from dateutil.relativedelta import relativedelta
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from datetime import tzinfo
|
||||
|
||||
_DATE_ONLY_FIELDS = frozenset({"created"})
|
||||
|
||||
_TODAY: Final[str] = "today"
|
||||
_YESTERDAY: Final[str] = "yesterday"
|
||||
_PREVIOUS_WEEK: Final[str] = "previous week"
|
||||
_THIS_MONTH: Final[str] = "this month"
|
||||
_PREVIOUS_MONTH: Final[str] = "previous month"
|
||||
_THIS_YEAR: Final[str] = "this year"
|
||||
_PREVIOUS_YEAR: Final[str] = "previous year"
|
||||
_PREVIOUS_QUARTER: Final[str] = "previous quarter"
|
||||
|
||||
_DATE_KEYWORDS = frozenset(
|
||||
{
|
||||
_TODAY,
|
||||
_YESTERDAY,
|
||||
_PREVIOUS_WEEK,
|
||||
_THIS_MONTH,
|
||||
_PREVIOUS_MONTH,
|
||||
_THIS_YEAR,
|
||||
_PREVIOUS_YEAR,
|
||||
_PREVIOUS_QUARTER,
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
def _fmt(dt: datetime) -> str:
|
||||
"""Format a datetime as an ISO 8601 UTC string for use in Tantivy range queries."""
|
||||
return dt.astimezone(UTC).strftime("%Y-%m-%dT%H:%M:%SZ")
|
||||
|
||||
|
||||
def _iso_range(lo: datetime, hi: datetime) -> str:
    """Render ``lo`` and ``hi`` as a bracketed ``[lo TO hi]`` ISO 8601 range string."""
    lower, upper = _fmt(lo), _fmt(hi)
    return "[" + lower + " TO " + upper + "]"
|
||||
|
||||
|
||||
def _quarter_start(d: date) -> date:
|
||||
"""Return the first day of the calendar quarter containing ``d``."""
|
||||
return date(d.year, ((d.month - 1) // 3) * 3 + 1, 1)
|
||||
|
||||
|
||||
def _midnight(d: date, tz: tzinfo) -> datetime:
|
||||
"""Convert a calendar date at local-timezone midnight to a UTC datetime."""
|
||||
return datetime(d.year, d.month, d.day, tzinfo=tz).astimezone(UTC)
|
||||
|
||||
|
||||
def _keyword_bounds(keyword: str, tz: tzinfo) -> tuple[date, date]:
    """
    Resolve a relative date keyword to a ``(start, exclusive_end)`` date pair.

    ``tz`` is used only to decide which calendar day "today" is; turning the
    returned dates into UTC datetime boundaries is left to the caller.

    Raises:
        ValueError: If ``keyword`` is not one of the supported keywords.
    """
    today = datetime.now(tz).date()
    one_day = timedelta(days=1)
    month_start = today.replace(day=1)

    if keyword == _TODAY:
        bounds = (today, today + one_day)
    elif keyword == _YESTERDAY:
        bounds = (today - one_day, today)
    elif keyword == _PREVIOUS_WEEK:
        # weekday() is 0 on Monday, so this lands on the current week's Monday.
        week_start = today - timedelta(days=today.weekday())
        bounds = (week_start - timedelta(weeks=1), week_start)
    elif keyword == _THIS_MONTH:
        bounds = (month_start, month_start + relativedelta(months=1))
    elif keyword == _PREVIOUS_MONTH:
        bounds = (month_start - relativedelta(months=1), month_start)
    elif keyword == _THIS_YEAR:
        bounds = (date(today.year, 1, 1), date(today.year + 1, 1, 1))
    elif keyword == _PREVIOUS_YEAR:
        bounds = (date(today.year - 1, 1, 1), date(today.year, 1, 1))
    elif keyword == _PREVIOUS_QUARTER:
        quarter_start = _quarter_start(today)
        bounds = (quarter_start - relativedelta(months=3), quarter_start)
    else:
        raise ValueError(f"Unknown keyword: {keyword}")
    return bounds
|
||||
|
||||
|
||||
def _date_only_range(keyword: str, tz: tzinfo) -> str:
    """
    Build the range string for `created` (DateField) from a date keyword.

    The keyword is resolved to local calendar dates, and those dates are then
    used directly as midnight-UTC boundaries; no timezone offset is applied.
    """
    first_day, end_day = _keyword_bounds(keyword, tz)
    lo, hi = (
        datetime(day.year, day.month, day.day, tzinfo=UTC)
        for day in (first_day, end_day)
    )
    return _iso_range(lo, hi)
|
||||
|
||||
|
||||
def _datetime_range(keyword: str, tz: tzinfo) -> str:
    """
    Build the range string for `added` / `modified` (DateTimeField, stored as UTC).

    The keyword is resolved to local calendar dates, and each date's midnight
    in ``tz`` is converted to UTC, so the timezone offset is applied in full.
    """
    bounds = _keyword_bounds(keyword, tz)
    lo, hi = (_midnight(day, tz) for day in bounds)
    return _iso_range(lo, hi)
|
||||
|
||||
|
||||
def _precision_bounds(digits: str) -> tuple[date, date] | None:
|
||||
"""
|
||||
Map a 4/6/8-digit date token to (start, exclusive_end) calendar dates.
|
||||
|
||||
YYYY -> whole year, YYYYMM -> whole month, YYYYMMDD -> single day.
|
||||
Returns None for any unparsable or out-of-range value (e.g. month 23),
|
||||
so callers can emit a no-match clause instead of erroring (Whoosh parity).
|
||||
"""
|
||||
try:
|
||||
if len(digits) == 4:
|
||||
year = int(digits)
|
||||
return date(year, 1, 1), date(year + 1, 1, 1)
|
||||
if len(digits) == 6:
|
||||
year, month = int(digits[:4]), int(digits[4:6])
|
||||
start = date(year, month, 1)
|
||||
end = date(year + 1, 1, 1) if month == 12 else date(year, month + 1, 1)
|
||||
return start, end
|
||||
if len(digits) == 8:
|
||||
start = date(int(digits[:4]), int(digits[4:6]), int(digits[6:8]))
|
||||
return start, start + timedelta(days=1)
|
||||
except ValueError:
|
||||
return None
|
||||
return None
|
||||
|
||||
|
||||
def _utc_bounds_for_field(
    field: str,
    start: date,
    end: date,
    tz: tzinfo,
) -> tuple[datetime, datetime]:
    """
    Turn calendar-date bounds into UTC datetimes according to the field type.

    Fields listed in ``_DATE_ONLY_FIELDS`` (``created``) get UTC midnight with
    no offset applied. Every other field (``added``/``modified``) gets
    midnight in ``tz`` converted to UTC.
    """
    # Date-only fields are anchored directly in UTC; for them the trailing
    # astimezone(UTC) is a no-op.
    zone = UTC if field in _DATE_ONLY_FIELDS else tz
    lo, hi = (
        datetime(day.year, day.month, day.day, tzinfo=zone).astimezone(UTC)
        for day in (start, end)
    )
    return lo, hi
|
||||
|
||||
|
||||
def _field_range_from_dates(field: str, start: date, end: date, tz: tzinfo) -> str:
    """Return ``field:[lo TO hi]`` with ISO 8601 bounds derived from calendar dates."""
    bounds = _utc_bounds_for_field(field, start, end, tz)
    return ":".join((field, _iso_range(*bounds)))
|
||||
+27
-405
@@ -1,88 +1,35 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from datetime import UTC
|
||||
from datetime import date
|
||||
from datetime import datetime
|
||||
from datetime import timedelta
|
||||
from typing import TYPE_CHECKING
|
||||
from typing import Final
|
||||
|
||||
import regex
|
||||
import tantivy
|
||||
from dateutil.relativedelta import relativedelta
|
||||
from django.conf import settings
|
||||
|
||||
from documents.search._dates import (
|
||||
_date_only_range, # noqa: F401 — re-exported for test imports
|
||||
)
|
||||
from documents.search._dates import (
|
||||
_datetime_range, # noqa: F401 — re-exported for test imports
|
||||
)
|
||||
from documents.search._tokenizer import simple_search_tokens
|
||||
from documents.search._translate import SearchQueryError
|
||||
from documents.search._translate import translate_query
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from datetime import tzinfo
|
||||
|
||||
from django.contrib.auth.base_user import AbstractBaseUser
|
||||
|
||||
logger = logging.getLogger("paperless.search")
|
||||
|
||||
# Maximum seconds any single regex substitution may run.
|
||||
# Prevents ReDoS on adversarial user-supplied query strings.
|
||||
_REGEX_TIMEOUT: Final[float] = 1.0
|
||||
|
||||
_DATE_ONLY_FIELDS = frozenset({"created"})
|
||||
|
||||
_TODAY: Final[str] = "today"
|
||||
_YESTERDAY: Final[str] = "yesterday"
|
||||
_PREVIOUS_WEEK: Final[str] = "previous week"
|
||||
_THIS_MONTH: Final[str] = "this month"
|
||||
_PREVIOUS_MONTH: Final[str] = "previous month"
|
||||
_THIS_YEAR: Final[str] = "this year"
|
||||
_PREVIOUS_YEAR: Final[str] = "previous year"
|
||||
_PREVIOUS_QUARTER: Final[str] = "previous quarter"
|
||||
|
||||
_DATE_KEYWORDS = frozenset(
|
||||
{
|
||||
_TODAY,
|
||||
_YESTERDAY,
|
||||
_PREVIOUS_WEEK,
|
||||
_THIS_MONTH,
|
||||
_PREVIOUS_MONTH,
|
||||
_THIS_YEAR,
|
||||
_PREVIOUS_YEAR,
|
||||
_PREVIOUS_QUARTER,
|
||||
},
|
||||
)
|
||||
|
||||
_DATE_KEYWORD_PATTERN = "|".join(
|
||||
sorted((regex.escape(k) for k in _DATE_KEYWORDS), key=len, reverse=True),
|
||||
)
|
||||
|
||||
_FIELD_DATE_RE = regex.compile(
|
||||
rf"""(?<!\w)(?P<field>created|modified|added)\s*:\s*(?:
|
||||
(?P<quote>["'])(?P<quoted>{_DATE_KEYWORD_PATTERN})(?P=quote)
|
||||
|
|
||||
(?P<bare>{_DATE_KEYWORD_PATTERN})(?![\w-])
|
||||
)""",
|
||||
regex.IGNORECASE | regex.VERBOSE,
|
||||
)
|
||||
_COMPACT_DATE_RE = regex.compile(r"\b(\d{14})\b")
|
||||
_RELATIVE_RANGE_RE = regex.compile(
|
||||
r"\[now([+-]\d+[dhm])?\s+TO\s+now([+-]\d+[dhm])?\]",
|
||||
regex.IGNORECASE,
|
||||
)
|
||||
# Whoosh-style relative date range: e.g. [-1 week to now], [-7 days to now]
|
||||
_WHOOSH_REL_RANGE_RE = regex.compile(
|
||||
r"\[-(?P<n>\d+)\s+(?P<unit>second|minute|hour|day|week|month|year)s?\s+to\s+now\]",
|
||||
regex.IGNORECASE,
|
||||
)
|
||||
# Whoosh-style 8-digit date: field:YYYYMMDD — field-aware so timezone can be applied correctly.
|
||||
# Scoped to date fields only; numeric fields (asn, id, page_count, ...) must not be rewritten.
|
||||
_DATE8_RE = regex.compile(
|
||||
r"(?<!\w)(?P<field>created|modified|added):(?P<date8>\d{8})\b",
|
||||
)
|
||||
_YEAR_RANGE_RE = regex.compile(
|
||||
r"(?<!\w)(?P<field>created|modified|added):\[(?P<y1>\d{4})\s+TO\s+(?P<y2>\d{4})\]",
|
||||
regex.IGNORECASE,
|
||||
)
|
||||
# Tantivy syntax error: " - " and " + " with spaces on both sides are invalid because
|
||||
# the NOT/MUST operators require no space between the operator and the term.
|
||||
# In natural-language queries (e.g., "H52.1 - Kurzsichtigkeit"), the dash is a separator.
|
||||
_SPACED_OPERATOR_RE = regex.compile(r"\s+[-+]\s+")
|
||||
_TRAILING_OPERATOR_RE = regex.compile(r"\s+[-+]+\s*$")
|
||||
# Matches CJK/Hangul characters so queries can be routed to bigram fields.
|
||||
# Uses Unicode properties to cover all blocks including Extension B+ planes.
|
||||
_CJK_RE: Final = regex.compile(r"[\p{Han}\p{Hiragana}\p{Katakana}\p{Hangul}]+")
|
||||
@@ -117,303 +64,12 @@ def _build_cjk_query(
|
||||
return None
|
||||
|
||||
|
||||
def _fmt(dt: datetime) -> str:
|
||||
"""Format a datetime as an ISO 8601 UTC string for use in Tantivy range queries."""
|
||||
return dt.astimezone(UTC).strftime("%Y-%m-%dT%H:%M:%SZ")
|
||||
|
||||
|
||||
def _iso_range(lo: datetime, hi: datetime) -> str:
    """Return the Tantivy range ``[lo TO hi]`` with both bounds in ISO 8601."""
    return "[{} TO {}]".format(_fmt(lo), _fmt(hi))
|
||||
|
||||
|
||||
def _date_only_range(keyword: str, tz: tzinfo) -> str:
    """
    Build the range string for `created` (DateField) from a date keyword.

    "Today" is the current calendar date in ``tz``. The resulting calendar
    dates are used directly as midnight-UTC boundaries, with no offset applied.

    Raises:
        ValueError: If ``keyword`` is not a supported keyword.
    """
    today = datetime.now(tz).date()

    def _span(start: date, end: date) -> str:
        # Both bounds are pinned to UTC midnight of the given calendar date.
        lo = datetime(start.year, start.month, start.day, tzinfo=UTC)
        hi = datetime(end.year, end.month, end.day, tzinfo=UTC)
        return _iso_range(lo, hi)

    one_day = timedelta(days=1)
    month_start = today.replace(day=1)

    if keyword == _TODAY:
        return _span(today, today + one_day)
    if keyword == _YESTERDAY:
        return _span(today - one_day, today)
    if keyword == _PREVIOUS_WEEK:
        # weekday() is 0 on Monday, so this lands on the current week's Monday.
        week_start = today - timedelta(days=today.weekday())
        return _span(week_start - timedelta(weeks=1), week_start)
    if keyword == _THIS_MONTH:
        return _span(month_start, month_start + relativedelta(months=1))
    if keyword == _PREVIOUS_MONTH:
        return _span(month_start - relativedelta(months=1), month_start)
    if keyword == _THIS_YEAR:
        return _span(date(today.year, 1, 1), date(today.year + 1, 1, 1))
    if keyword == _PREVIOUS_YEAR:
        return _span(date(today.year - 1, 1, 1), date(today.year, 1, 1))
    if keyword == _PREVIOUS_QUARTER:
        quarter_start = date(today.year, ((today.month - 1) // 3) * 3 + 1, 1)
        return _span(quarter_start - relativedelta(months=3), quarter_start)
    raise ValueError(f"Unknown keyword: {keyword}")
|
||||
|
||||
|
||||
def _datetime_range(keyword: str, tz: tzinfo) -> str:
    """
    Build the range string for `added` / `modified` (DateTimeField, stored as UTC).

    "Today" is the current calendar date in ``tz``. Each boundary is midnight
    of a calendar date in ``tz`` converted to UTC, so the offset is applied.

    Raises:
        ValueError: If ``keyword`` is not a supported keyword.
    """
    today = datetime.now(tz).date()

    def _span(start: date, end: date) -> str:
        # Local-timezone midnight of each date, expressed in UTC.
        lo = datetime(start.year, start.month, start.day, tzinfo=tz).astimezone(UTC)
        hi = datetime(end.year, end.month, end.day, tzinfo=tz).astimezone(UTC)
        return _iso_range(lo, hi)

    one_day = timedelta(days=1)
    month_start = today.replace(day=1)

    if keyword == _TODAY:
        return _span(today, today + one_day)
    if keyword == _YESTERDAY:
        return _span(today - one_day, today)
    if keyword == _PREVIOUS_WEEK:
        # weekday() is 0 on Monday, so this lands on the current week's Monday.
        week_start = today - timedelta(days=today.weekday())
        return _span(week_start - timedelta(weeks=1), week_start)
    if keyword == _THIS_MONTH:
        return _span(month_start, month_start + relativedelta(months=1))
    if keyword == _PREVIOUS_MONTH:
        return _span(month_start - relativedelta(months=1), month_start)
    if keyword == _THIS_YEAR:
        return _span(date(today.year, 1, 1), date(today.year + 1, 1, 1))
    if keyword == _PREVIOUS_YEAR:
        return _span(date(today.year - 1, 1, 1), date(today.year, 1, 1))
    if keyword == _PREVIOUS_QUARTER:
        quarter_start = date(today.year, ((today.month - 1) // 3) * 3 + 1, 1)
        return _span(quarter_start - relativedelta(months=3), quarter_start)
    raise ValueError(f"Unknown keyword: {keyword}")
|
||||
|
||||
|
||||
def _rewrite_compact_date(query: str) -> str:
    """Convert Whoosh compact timestamps (14 digits, YYYYMMDDHHmmss) to ISO 8601.

    The digits are read as a UTC date and time. A token whose digits do not
    form a valid date/time is left exactly as written.

    Raises:
        ValueError: If the regex substitution exceeds ``_REGEX_TIMEOUT``.
    """
    # Slice offsets separating YYYY, MM, DD, HH, mm and ss.
    cuts = (0, 4, 6, 8, 10, 12, 14)

    def _to_iso(match: regex.Match[str]) -> str:
        digits = match.group(1)
        try:
            parts = [int(digits[a:b]) for a, b in zip(cuts, cuts[1:])]
            return datetime(*parts, tzinfo=UTC).strftime("%Y-%m-%dT%H:%M:%SZ")
        except ValueError:
            return str(match.group(0))

    try:
        rewritten = _COMPACT_DATE_RE.sub(_to_iso, query, timeout=_REGEX_TIMEOUT)
    except TimeoutError:  # pragma: no cover
        raise ValueError(
            "Query too complex to process (compact date rewrite timed out)",
        )
    return rewritten
|
||||
|
||||
|
||||
def _rewrite_relative_range(query: str) -> str:
    """Rewrite Whoosh relative ranges ([now-7d TO now]) to concrete ISO 8601 UTC boundaries.

    Each bound is ``now`` plus an optional signed offset in days (``d``),
    hours (``h``) or minutes (``m``). Reversed bounds are swapped. A range
    whose offset cannot be represented is left exactly as written.

    Args:
        query: The query string to rewrite.

    Returns:
        The query with every matching relative range replaced.

    Raises:
        ValueError: If the regex substitution exceeds ``_REGEX_TIMEOUT``.
    """
    # timedelta keyword for each unit suffix accepted by _RELATIVE_RANGE_RE.
    unit_kwargs = {"d": "days", "h": "hours", "m": "minutes"}

    def _offset(s: str | None) -> timedelta:
        if not s:
            return timedelta(0)
        sign = 1 if s[0] == "+" else -1
        # The pattern is case-insensitive, so the suffix may be upper-case.
        n, unit = int(s[1:-1]), s[-1].lower()
        # Build only the requested unit: constructing every unit eagerly can
        # overflow on one the user never asked for.
        return sign * timedelta(**{unit_kwargs[unit]: n})

    def _sub(m: regex.Match[str]) -> str:
        now = datetime.now(UTC)
        try:
            lo, hi = now + _offset(m.group(1)), now + _offset(m.group(2))
        except OverflowError:
            # Offset outside the representable datetime range; keep the text.
            return m.group(0)
        if lo > hi:
            lo, hi = hi, lo
        return f"[{_fmt(lo)} TO {_fmt(hi)}]"

    try:
        return _RELATIVE_RANGE_RE.sub(_sub, query, timeout=_REGEX_TIMEOUT)
    except TimeoutError:  # pragma: no cover
        raise ValueError(
            "Query too complex to process (relative range rewrite timed out)",
        )
|
||||
|
||||
|
||||
def _rewrite_whoosh_relative_range(query: str) -> str:
    """Rewrite Whoosh-style relative date ranges ([-N unit to now]) to ISO 8601.

    Supports: second, minute, hour, day, week, month, year (singular and plural).
    Example: ``added:[-1 week to now]`` → ``added:[2025-01-01T… TO 2025-01-08T…]``

    A range whose offset cannot be represented (for example one reaching
    before year 1) is left exactly as written.

    Args:
        query: The query string to rewrite.

    Returns:
        The query with every matching relative range replaced.

    Raises:
        ValueError: If the regex substitution exceeds ``_REGEX_TIMEOUT``.
    """
    now = datetime.now(UTC)
    # Units with a fixed length map straight onto a timedelta keyword.
    fixed_units = {
        "second": "seconds",
        "minute": "minutes",
        "hour": "hours",
        "day": "days",
        "week": "weeks",
    }

    def _sub(m: regex.Match[str]) -> str:
        n = int(m.group("n"))
        unit = m.group("unit").lower()
        try:
            # Build only the requested unit: constructing every unit eagerly
            # overflows on e.g. weeks=n even when the user asked for seconds.
            delta: timedelta | relativedelta
            if unit in fixed_units:
                delta = timedelta(**{fixed_units[unit]: n})
            elif unit == "month":
                delta = relativedelta(months=n)
            else:
                delta = relativedelta(years=n)
            lo = now - delta
        except (OverflowError, ValueError):
            # Offset outside the representable datetime range; keep the text.
            return m.group(0)
        return f"[{_fmt(lo)} TO {_fmt(now)}]"

    try:
        return _WHOOSH_REL_RANGE_RE.sub(_sub, query, timeout=_REGEX_TIMEOUT)
    except TimeoutError:  # pragma: no cover
        raise ValueError(
            "Query too complex to process (Whoosh relative range rewrite timed out)",
        )
|
||||
|
||||
|
||||
def _rewrite_8digit_date(query: str, tz: tzinfo) -> str:
    """Rewrite field:YYYYMMDD date tokens to an ISO 8601 day range.

    Runs after ``_rewrite_compact_date`` so 14-digit timestamps are already
    converted and won't spuriously match here.

    For DateField fields (e.g. ``created``) uses UTC midnight boundaries.
    For DateTimeField fields (e.g. ``added``, ``modified``) uses local TZ
    midnight boundaries converted to UTC — matching the ``_datetime_range``
    behaviour for keyword dates.

    A token that is not a valid date, or whose following day cannot be
    represented, is left exactly as written.

    Args:
        query: The query string to rewrite.
        tz: Timezone whose midnight bounds the day for DateTimeField fields.

    Returns:
        The query with every matching ``field:YYYYMMDD`` token replaced.

    Raises:
        ValueError: If the regex substitution exceeds ``_REGEX_TIMEOUT``.
    """

    def _sub(m: regex.Match[str]) -> str:
        field = m.group("field")
        raw = m.group("date8")
        try:
            d = date(int(raw[0:4]), int(raw[4:6]), int(raw[6:8]))
            next_day = d + timedelta(days=1)
            # Date-only fields are anchored in UTC; datetime fields use
            # local-timezone midnight converted to UTC.
            zone = UTC if field in _DATE_ONLY_FIELDS else tz
            lo = datetime(d.year, d.month, d.day, tzinfo=zone).astimezone(UTC)
            hi = datetime(
                next_day.year,
                next_day.month,
                next_day.day,
                tzinfo=zone,
            ).astimezone(UTC)
            return f"{field}:[{_fmt(lo)} TO {_fmt(hi)}]"
        except (ValueError, OverflowError):
            # ValueError: invalid calendar date. OverflowError: the day after
            # date.max, or a UTC conversion outside the supported range.
            return m.group(0)

    try:
        return _DATE8_RE.sub(_sub, query, timeout=_REGEX_TIMEOUT)
    except TimeoutError:  # pragma: no cover
        raise ValueError(
            "Query too complex to process (8-digit date rewrite timed out)",
        )
|
||||
|
||||
|
||||
def _rewrite_year_range(query: str) -> str:
    """Rewrite Whoosh-style year-only date ranges to ISO 8601 UTC boundaries.

    Converts ``field:[YYYY TO YYYY]`` to a full ISO 8601 datetime range.
    The upper bound is the start of the year after the end year (exclusive),
    matching the Whoosh convention of treating year-only ranges as full-year spans.

    A range containing a year that ``datetime`` cannot represent (``0000``, or
    an end year of ``9999`` whose exclusive bound would be year 10000) is left
    exactly as written.

    Args:
        query: The query string to rewrite.

    Returns:
        The query with every matching year range replaced.

    Raises:
        ValueError: If the regex substitution exceeds ``_REGEX_TIMEOUT``.
    """

    def _sub(m: regex.Match[str]) -> str:
        field = m.group("field")
        y1, y2 = int(m.group("y1")), int(m.group("y2"))
        # Whoosh swaps a reversed range when both years are explicit
        # (whoosh.util.times.timespan.disambiguated); match that so a backwards
        # range spans the intended years instead of matching nothing.
        lo_year, hi_year = min(y1, y2), max(y1, y2)
        try:
            lo = datetime(lo_year, 1, 1, tzinfo=UTC)
            hi = datetime(hi_year + 1, 1, 1, tzinfo=UTC)
        except ValueError:
            # Year outside datetime's supported 1..9999 range; keep the text
            # rather than raising out of the substitution callback.
            return m.group(0)
        return f"{field}:[{_fmt(lo)} TO {_fmt(hi)}]"

    try:
        return _YEAR_RANGE_RE.sub(_sub, query, timeout=_REGEX_TIMEOUT)
    except TimeoutError:  # pragma: no cover
        raise ValueError("Query too complex to process (year range rewrite timed out)")
|
||||
|
||||
|
||||
def rewrite_natural_date_keywords(query: str, tz: tzinfo) -> str:
|
||||
"""
|
||||
Rewrite natural date syntax to ISO 8601 format for Tantivy compatibility.
|
||||
|
||||
Performs the first stage of query preprocessing, converting various date
|
||||
formats and keywords to ISO 8601 datetime ranges that Tantivy can parse:
|
||||
- Compact 14-digit dates (YYYYMMDDHHmmss)
|
||||
- Whoosh relative ranges ([-7 days to now], [now-1h TO now+2h])
|
||||
- 8-digit dates with field awareness (created:20240115)
|
||||
- Natural keywords (field:today, field:"previous quarter", etc.)
|
||||
Delegates to ``translate_query`` which handles all date forms, comma
|
||||
expansion, field aliasing, relative ranges, and operator normalization.
|
||||
|
||||
Args:
|
||||
query: Raw user query string
|
||||
@@ -425,35 +81,15 @@ def rewrite_natural_date_keywords(query: str, tz: tzinfo) -> str:
|
||||
Note:
|
||||
Bare keywords without field prefixes pass through unchanged.
|
||||
"""
|
||||
query = _rewrite_compact_date(query)
|
||||
query = _rewrite_whoosh_relative_range(query)
|
||||
query = _rewrite_year_range(query)
|
||||
query = _rewrite_8digit_date(query, tz)
|
||||
query = _rewrite_relative_range(query)
|
||||
|
||||
def _replace(m: regex.Match[str]) -> str:
|
||||
field = m.group("field")
|
||||
keyword = (m.group("quoted") or m.group("bare")).lower()
|
||||
if field in _DATE_ONLY_FIELDS:
|
||||
return f"{field}:{_date_only_range(keyword, tz)}"
|
||||
return f"{field}:{_datetime_range(keyword, tz)}"
|
||||
|
||||
try:
|
||||
return _FIELD_DATE_RE.sub(_replace, query, timeout=_REGEX_TIMEOUT)
|
||||
except TimeoutError: # pragma: no cover
|
||||
raise ValueError(
|
||||
"Query too complex to process (date keyword rewrite timed out)",
|
||||
)
|
||||
return translate_query(query, tz)
|
||||
|
||||
|
||||
def normalize_query(query: str) -> str:
|
||||
"""
|
||||
Normalize query syntax for better search behavior.
|
||||
|
||||
Expands comma-separated field values to explicit AND clauses and
|
||||
collapses excessive whitespace for cleaner parsing:
|
||||
- tag:foo,bar → tag:foo AND tag:bar
|
||||
- multiple spaces → single spaces
|
||||
Delegates to ``translate_query`` which handles comma expansion, whitespace
|
||||
collapsing, operator normalization, and field aliasing.
|
||||
|
||||
Args:
|
||||
query: Query string after date rewriting
|
||||
@@ -461,29 +97,7 @@ def normalize_query(query: str) -> str:
|
||||
Returns:
|
||||
Normalized query string ready for Tantivy parsing
|
||||
"""
|
||||
|
||||
def _expand(m: regex.Match[str]) -> str:
|
||||
field = m.group(1)
|
||||
values = [v.strip() for v in m.group(2).split(",") if v.strip()]
|
||||
return " AND ".join(f"{field}:{v}" for v in values)
|
||||
|
||||
try:
|
||||
query = regex.sub(
|
||||
r"(\w+):([^\s\[\]]+(?:,[^\s\[\]]+)+)",
|
||||
_expand,
|
||||
query,
|
||||
timeout=_REGEX_TIMEOUT,
|
||||
)
|
||||
query = regex.sub(r" {2,}", " ", query, timeout=_REGEX_TIMEOUT).strip()
|
||||
# Strip trailing dangling operators before Tantivy sees them.
|
||||
query = _TRAILING_OPERATOR_RE.sub("", query, timeout=_REGEX_TIMEOUT).strip()
|
||||
# Replace " - " / " + " with a space: Tantivy requires no space between
|
||||
# the operator and its operand (-term / +term), so spaces on both sides
|
||||
# means this is a natural-language separator, not a query operator.
|
||||
query = _SPACED_OPERATOR_RE.sub(" ", query, timeout=_REGEX_TIMEOUT).strip()
|
||||
return query
|
||||
except TimeoutError: # pragma: no cover
|
||||
raise ValueError("Query too complex to process (normalization timed out)")
|
||||
return translate_query(query, UTC)
|
||||
|
||||
|
||||
def build_permission_filter(
|
||||
@@ -603,8 +217,16 @@ def parse_user_query(
|
||||
as a post-search score filter, not during query construction.
|
||||
"""
|
||||
|
||||
query_str = rewrite_natural_date_keywords(raw_query, tz)
|
||||
query_str = normalize_query(query_str)
|
||||
try:
|
||||
query_str = translate_query(raw_query, tz)
|
||||
except SearchQueryError:
|
||||
# Intentional, user-fixable error (e.g. an unparsable date). Propagate so
|
||||
# the view can return a 400 with a helpful message rather than falling
|
||||
# back to the raw (still-invalid) query.
|
||||
raise
|
||||
except Exception: # pragma: no cover - defensive
|
||||
logger.warning("Query translation failed; using raw query", exc_info=True)
|
||||
query_str = raw_query
|
||||
|
||||
exact = index.parse_query(
|
||||
query_str,
|
||||
|
||||
@@ -0,0 +1,566 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from datetime import UTC
|
||||
from datetime import datetime
|
||||
from datetime import timedelta
|
||||
from typing import TYPE_CHECKING
|
||||
from typing import TypeAlias
|
||||
|
||||
import regex
|
||||
from dateutil.relativedelta import relativedelta
|
||||
|
||||
from documents.search._dates import _DATE_KEYWORDS
|
||||
from documents.search._dates import _DATE_ONLY_FIELDS
|
||||
from documents.search._dates import _date_only_range
|
||||
from documents.search._dates import _datetime_range
|
||||
from documents.search._dates import _field_range_from_dates
|
||||
from documents.search._dates import _fmt
|
||||
from documents.search._dates import _precision_bounds
|
||||
from documents.search._dates import _utc_bounds_for_field
|
||||
|
||||
# Compiled regex that matches any known multi-word (or single-word) date keyword
|
||||
# at the start of a match position, longest alternatives first so "previous week"
|
||||
# wins over a hypothetical shorter "previous".
|
||||
_KEYWORD_VALUE_RE = regex.compile(
|
||||
"|".join(sorted((regex.escape(k) for k in _DATE_KEYWORDS), key=len, reverse=True)),
|
||||
regex.IGNORECASE,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from datetime import tzinfo
|
||||
|
||||
# TODO: this module translates date queries into Tantivy *string* syntax, which
|
||||
# forces a workaround for something Tantivy's string parser cannot express on
|
||||
# date fields: open-ended ranges use far-past/far-future string sentinels
|
||||
# (OPEN_LO/OPEN_HI). These can be replaced with a real tantivy.Query object
|
||||
# (Query.range_query(..., None) for open bounds) once tantivy-py accepts Python
|
||||
# datetimes in range_query/term_query on Date fields. That support exists on
|
||||
# tantivy-py master (PRs #655 + #666) but postdates the pinned 0.26.0 wheel, so
|
||||
# it is blocked only on a published release > 0.26.0 and a dependency bump.
|
||||
# (Unparsable dates now raise InvalidDateQuery -> HTTP 400 rather than using a
|
||||
# no-match string sentinel.)
|
||||
|
||||
# Fields that store exact, non-analyzed comma-joined tokens in the index and so
|
||||
# need explicit comma->AND expansion (Whoosh KEYWORD(commas=True) set).
|
||||
MULTI_VALUE_FIELDS = frozenset({"tag", "tag_id", "viewer_id"})
|
||||
|
||||
# Date fields whose values/ranges get rewritten to RFC3339 Tantivy ranges.
|
||||
DATE_FIELDS = frozenset({"created", "modified", "added"})
|
||||
|
||||
# Field aliases: Whoosh (v2) field names that were renamed in the Tantivy schema.
|
||||
# Preserved here so v2 queries using the old names continue to work without 400
|
||||
# errors instead of silently failing. Applied by _render to non-date field tokens.
|
||||
FIELD_ALIASES: dict[str, str] = {
|
||||
"type": "document_type",
|
||||
"type_id": "document_type_id",
|
||||
"path": "storage_path",
|
||||
"path_id": "storage_path_id",
|
||||
}
|
||||
|
||||
# Known schema fields: a comma immediately followed by ``<known>:`` is a clause
|
||||
# separator. Restricting to known fields prevents URL-like ``http:`` misfires.
|
||||
KNOWN_FIELDS = frozenset(
|
||||
{
|
||||
"title",
|
||||
"content",
|
||||
"correspondent",
|
||||
"document_type",
|
||||
"type", # v2 alias -> document_type
|
||||
"storage_path",
|
||||
"path", # v2 alias -> storage_path
|
||||
"tag",
|
||||
"tag_id",
|
||||
"correspondent_id",
|
||||
"document_type_id",
|
||||
"type_id", # v2 alias -> document_type_id
|
||||
"storage_path_id",
|
||||
"path_id", # v2 alias -> storage_path_id
|
||||
"owner_id",
|
||||
"viewer_id",
|
||||
"asn",
|
||||
"page_count",
|
||||
"num_notes",
|
||||
"created",
|
||||
"modified",
|
||||
"added",
|
||||
"original_filename",
|
||||
"checksum",
|
||||
"notes",
|
||||
"custom_fields",
|
||||
},
|
||||
)
|
||||
|
||||
_FIELD_RE = regex.compile(r"(?P<field>\w+):")
|
||||
|
||||
# Matches the TO separator inside a range bracket. Handles three forms:
|
||||
# middle: "lo TO hi" (either lo or hi may be empty)
|
||||
# trailing: "lo TO" (open upper bound)
|
||||
# leading: "TO hi" (open lower bound)
|
||||
# Bounds MAY contain internal spaces (e.g. "-7 days"), so we use .*? / .+?
|
||||
# and split on the whitespace-delimited " TO " / " to " separator.
|
||||
_RANGE_RE = regex.compile(
|
||||
r"^\s*(?P<lo>.*?)\s+[Tt][Oo]\s+(?P<hi>.+?)\s*$"
|
||||
r"|"
|
||||
r"^\s*(?P<lo2>.+?)\s+[Tt][Oo]\s*$"
|
||||
r"|"
|
||||
r"^\s*[Tt][Oo]\s+(?P<hi2>.+?)\s*$",
|
||||
)
|
||||
|
||||
|
||||
@dataclass(frozen=True, slots=True)
|
||||
class FieldValue:
|
||||
field: str
|
||||
value: str
|
||||
|
||||
|
||||
# Produced by the comma-resolution pass (not by scan()).
|
||||
@dataclass(frozen=True, slots=True)
|
||||
class FieldValueList:
|
||||
field: str
|
||||
values: tuple[str, ...]
|
||||
|
||||
|
||||
@dataclass(frozen=True, slots=True)
|
||||
class FieldRange:
|
||||
field: str
|
||||
open: str
|
||||
lo: str
|
||||
hi: str
|
||||
close: str
|
||||
|
||||
|
||||
# Produced by the comma-resolution pass (not by scan()).
|
||||
@dataclass(frozen=True, slots=True)
|
||||
class Comma:
|
||||
pass
|
||||
|
||||
|
||||
@dataclass(frozen=True, slots=True)
|
||||
class Passthrough:
|
||||
raw: str
|
||||
|
||||
|
||||
Token: TypeAlias = FieldValue | FieldValueList | FieldRange | Comma | Passthrough
|
||||
|
||||
_CLOSE: dict[str, str] = {"[": "]", "{": "}"}
|
||||
|
||||
|
||||
def scan(query: str) -> list[Token]:
    """
    Split a raw query into field-aware tokens.

    Known ``field:value`` and ``field:[range]`` clauses become structured
    tokens; every other character is collected into verbatim ``Passthrough``
    runs. The scanner is non-recursive: a range ends at the first matching
    close bracket, and anything that cannot be consumed as a clause is kept
    as passthrough text.
    """
    result: list[Token] = []
    pending: list[str] = []  # passthrough characters not yet emitted
    pos = 0
    length = len(query)
    while pos < length:
        hit = _match_field_token(query, pos)
        if hit is not None:
            token, pos = hit
            # Emit the text that preceded this clause before the clause itself.
            _flush(pending, result)
            result.append(token)
            pos = _maybe_comma(query, pos, result)
        else:
            pending.append(query[pos])
            pos += 1
    _flush(pending, result)
    return result
|
||||
|
||||
|
||||
def _flush(buf: list[str], tokens: list[Token]) -> None:
    """Move buffered passthrough characters into ``tokens`` as one token and empty ``buf``."""
    if not buf:
        return
    tokens.append(Passthrough("".join(buf)))
    buf.clear()
|
||||
|
||||
|
||||
def _at_word_boundary(query: str, i: int) -> bool:
|
||||
"""A field token may begin only at the start or after a non-word character."""
|
||||
return i == 0 or not (query[i - 1].isalnum() or query[i - 1] == "_")
|
||||
|
||||
|
||||
def _match_field_token(query: str, i: int) -> tuple[Token, int] | None:
    """
    Try to consume a known ``field:`` clause beginning at index ``i``.

    Returns ``(token, end_index)`` for ``field:[range]`` or ``field:value``.
    Returns None when no known field starts here, when ``i`` is in the middle
    of a word, or when the range/value cannot be consumed, so the caller keeps
    the character as passthrough text.
    """
    found = _FIELD_RE.match(query, i)
    if found is None:
        return None
    name = found.group("field")
    if name not in KNOWN_FIELDS or not _at_word_boundary(query, i):
        return None

    after_colon = found.end()
    starts_range = after_colon < len(query) and query[after_colon] in "[{"
    if starts_range:
        return _consume_range(query, after_colon, name)

    scalar = _consume_field_value(query, name, after_colon)
    if scalar is None:
        return None
    text, stop = scalar
    return FieldValue(name, text), stop
|
||||
|
||||
|
||||
def _consume_field_value(query: str, field: str, start: int) -> tuple[str, int] | None:
    """
    Read the value of a ``field:`` clause beginning at ``start``.

    For date fields a keyword phrase matched by ``_KEYWORD_VALUE_RE`` is tried
    first; otherwise a single bare or quoted value is read. Comma-joined
    continuations are then appended to the value, except where the comma is
    followed by another known field (that comma is a clause separator and is
    left in place). Returns ``(value, end_index)`` or None when nothing can be
    consumed.
    """
    length = len(query)

    first: tuple[str, int] | None = None
    if field in DATE_FIELDS:
        keyword = _KEYWORD_VALUE_RE.match(query, start)
        if keyword is not None:
            stop = keyword.end()
            # The keyword must end the query or be followed by a delimiter.
            if stop >= length or query[stop] in " \t),":
                first = (keyword.group(0), stop)
    if first is None:
        first = _consume_value(query, start)
        if first is None:
            return None

    pieces = [first[0]]
    pos = first[1]
    while pos < length and query[pos] == ",":
        if _looks_like_known_field(query, pos + 1):
            # Clause separator: left for _maybe_comma to emit a Comma().
            break
        extra = _consume_value(query, pos + 1)
        if extra is None:
            break
        pieces.append(extra[0])
        pos = extra[1]
    return ",".join(pieces), pos
|
||||
|
||||
|
||||
def _consume_range(
    query: str,
    start: int,
    field: str,
) -> tuple[FieldRange, int] | None:
    """
    Consume ``[lo TO hi]`` / ``{lo TO hi}`` where ``start`` indexes the bracket.

    Returns ``(FieldRange, end_index)``, or None when the matching close
    bracket is missing. Bracket content without a TO separator is treated as a
    lower bound with an open upper bound.
    """
    opener = query[start]
    closer = _CLOSE[opener]
    stop = query.find(closer, start + 1)
    if stop == -1:
        return None

    body = query[start + 1 : stop]
    parsed = _RANGE_RE.match(body)
    if parsed is None:
        # No TO separator at all.
        lower, upper = body.strip(), ""
    elif parsed.group("lo") is not None or parsed.group("hi") is not None:
        # Middle form: "lo TO hi".
        lower = (parsed.group("lo") or "").strip()
        upper = (parsed.group("hi") or "").strip()
    elif parsed.group("lo2") is not None:
        # Trailing form: "lo TO".
        lower, upper = parsed.group("lo2").strip(), ""
    else:
        # Leading form: "TO hi".
        lower, upper = "", (parsed.group("hi2") or "").strip()
    return FieldRange(field, opener, lower, upper, closer), stop + 1
|
||||
|
||||
|
||||
def _consume_value(query: str, start: int) -> tuple[str, int] | None:
|
||||
"""Consume a bare or quoted field value from ``start``, stopping at comma."""
|
||||
n = len(query)
|
||||
if start >= n or query[start] in " \t":
|
||||
return None
|
||||
if query[start] in "\"'":
|
||||
quote = query[start]
|
||||
end = query.find(quote, start + 1)
|
||||
if end == -1:
|
||||
return None
|
||||
return query[start : end + 1], end + 1
|
||||
j = start
|
||||
while j < n and query[j] not in " \t),":
|
||||
j += 1
|
||||
return query[start:j], j
|
||||
|
||||
|
||||
def _looks_like_known_field(query: str, pos: int) -> bool:
    """Report whether a ``field:`` prefix naming a known field begins at ``pos``."""
    hit = _FIELD_RE.match(query, pos)
    if not hit:
        return False
    return hit.group("field") in KNOWN_FIELDS
|
||||
|
||||
|
||||
def _maybe_comma(query: str, i: int, tokens: list) -> int:
    """
    Consume a clause-separator comma at ``i`` if there is one.

    A comma counts as a separator only when a known ``field:`` follows it
    directly. In that case ``Comma()`` is appended to ``tokens`` and the index
    after the comma is returned; otherwise ``i`` is returned unchanged.
    """
    if i >= len(query) or query[i] != ",":
        return i
    if not _looks_like_known_field(query, i + 1):
        return i
    tokens.append(Comma())
    return i + 1
|
||||
|
||||
|
||||
def _split_unquoted_commas(value: str) -> list[str]:
    """Split ``value`` on commas, keeping a quoted part (``"a,b"``) intact."""
    parts: list[str] = []
    part_start = 0
    i, n = 0, len(value)
    while i < n:
        ch = value[i]
        # A quote only opens a quoted part at the very start of that part,
        # mirroring how _consume_value reads quoted values.
        if i == part_start and ch in "\"'":
            closing = value.find(ch, i + 1)
            if closing != -1:
                i = closing + 1
                continue
            # Unterminated quote: treat the quote as an ordinary character.
        if ch == ",":
            parts.append(value[part_start:i])
            part_start = i + 1
        i += 1
    parts.append(value[part_start:])
    return parts


def resolve_commas(tokens: list) -> list:
    """
    Collapse value-list commas into ``FieldValueList`` tokens.

    A ``FieldValue`` on a multi-value field whose value contains commas is
    split into its individual values (empty parts are dropped). Commas inside
    a quoted part are literal and never split, so ``tag:"a,b"`` stays a single
    value. All other tokens, including the clause-separator ``Comma`` tokens
    already emitted by ``scan``, are returned unchanged.

    Returns a new list; the input list is not modified.
    """
    out: list = []
    for tok in tokens:
        is_candidate = (
            isinstance(tok, FieldValue)
            and tok.field in MULTI_VALUE_FIELDS
            and "," in tok.value
        )
        if not is_candidate:
            out.append(tok)
            continue
        parts = _split_unquoted_commas(tok.value)
        if len(parts) == 1:
            # Every comma was inside quotes: this is still one scalar value.
            out.append(tok)
            continue
        out.append(FieldValueList(tok.field, tuple(p for p in parts if p)))
    return out
|
||||
|
||||
|
||||
class SearchQueryError(ValueError):
    """
    Base class for user-fixable search query errors.

    Carries a message safe to surface to the user (no internal details). The view
    layer catches this and returns an HTTP 400, so any future subclass (unknown
    field, malformed range, wrapped parser errors) gets the same treatment.
    """
|
||||
|
||||
|
||||
class InvalidDateQuery(SearchQueryError):
    """
    Signals that a date field value or range bound could not be parsed.

    The offending ``field`` and ``value`` are kept as attributes.
    """

    def __init__(self, field: str, value: str) -> None:
        message = f"Invalid date value {value!r} for field {field!r}."
        super().__init__(message)
        self.field = field
        self.value = value
|
||||
|
||||
|
||||
# Compact date of 4, 6 or 8 digits (YYYY, YYYYMM, YYYYMMDD).
_DIGITS_RE = regex.compile(r"^\d{4}(?:\d{2}){0,2}$")
# Dashed date: YYYY, YYYY-MM or YYYY-MM-DD.
_ISO_RE = regex.compile(r"^\d{4}(?:-\d{2}(?:-\d{2})?)?$")
|
||||
|
||||
|
||||
def translate_scalar(field: str, value: str, tz: tzinfo) -> str:
    """
    Translate a scalar date-field value to a Tantivy range string.

    Accepted shapes, each optionally wrapped in quotes:

    - a date keyword from ``_DATE_KEYWORDS`` (case-insensitive)
    - a partial or full date: YYYY, YYYYMM, YYYYMMDD, YYYY-MM, YYYY-MM-DD
    - a 14-digit compact timestamp YYYYMMDDHHMMSS, read as UTC and emitted
      as a range whose two bounds are that same instant

    Raises:
        InvalidDateQuery: if the value has none of these shapes or names an
            impossible date. The error carries ``value`` as originally given.
    """
    # Strip quotes once and use the result for every shape, so a quoted date
    # (created:"2020-01-15") is handled the same way as a quoted keyword.
    unquoted = value.strip("\"'").strip()

    keyword = unquoted.lower()
    if keyword in _DATE_KEYWORDS:
        if field in _DATE_ONLY_FIELDS:
            return f"{field}:{_date_only_range(keyword, tz)}"
        return f"{field}:{_datetime_range(keyword, tz)}"

    if _DIGITS_RE.match(unquoted) or _ISO_RE.match(unquoted):
        bounds = _precision_bounds(unquoted.replace("-", ""))
        if bounds is None:
            raise InvalidDateQuery(field, value)
        return _field_range_from_dates(field, bounds[0], bounds[1], tz)

    if regex.fullmatch(r"\d{14}", unquoted):
        try:
            dt = datetime(
                int(unquoted[0:4]),
                int(unquoted[4:6]),
                int(unquoted[6:8]),
                int(unquoted[8:10]),
                int(unquoted[10:12]),
                int(unquoted[12:14]),
                tzinfo=UTC,
            )
        except ValueError:
            raise InvalidDateQuery(field, value) from None
        iso = _fmt(dt)
        return f"{field}:[{iso} TO {iso}]"

    # Unrecognized shape -> tell the user their date is malformed rather than
    # silently matching nothing or emitting invalid Tantivy syntax.
    raise InvalidDateQuery(field, value)
|
||||
|
||||
|
||||
# Open-bound sentinels for date ranges. These far-past/far-future strings allow
# open-ended ranges to be expressed as Tantivy string queries until tantivy-py
# exposes Query.range_query(..., None) on Date fields (see module TODO).
# OPEN_LO stands in for a missing lower bound, OPEN_HI for a missing upper bound.
OPEN_LO = "0001-01-01T00:00:00Z"
OPEN_HI = "9999-12-31T23:59:59Z"
|
||||
|
||||
|
||||
# Matches compact now-offset tokens like now-7d, now+1h, now-30m
# (case-insensitive; d = days, h = hours, m = minutes).
_NOW_COMPACT_RE = regex.compile(
    r"^now(?P<sign>[+-])(?P<n>\d+)(?P<unit>[dhm])$",
    regex.IGNORECASE,
)

# Matches "±N <unit>" Whoosh-style offsets (e.g. -7 days, -1 week, +3 hours).
# Unit is singular or plural; sign prefix is mandatory; the whitespace between
# the number and the unit is optional.
_NOW_SPACED_RE = regex.compile(
    r"^(?P<sign>[+-])(?P<n>\d+)\s*"
    r"(?P<unit>second|minute|hour|day|week|month|year)s?$",
    regex.IGNORECASE,
)
|
||||
|
||||
|
||||
def _resolve_relative_bound(token: str) -> datetime | None:
    """
    Resolve a relative bound token to an exact UTC instant, or return None.

    Supported forms:
    - ``now``               -> current UTC instant
    - ``now+/-<n>d/h/m``    -> now +/- timedelta (d=days, h=hours, m=minutes)
    - ``±N <unit>``         -> now +/- delta; month/year use relativedelta

    None is returned when the token has none of these forms, and also when the
    offset is too large to represent or moves the result outside the range
    ``datetime`` supports, so the caller can report the bound as invalid.
    """
    stripped = token.strip()
    now = datetime.now(UTC)

    if stripped.lower() == "now":
        return now

    compact = _NOW_COMPACT_RE.match(stripped)
    m = compact or _NOW_SPACED_RE.match(stripped)
    if not m:
        return None

    unit = m.group("unit").lower()
    subtract = m.group("sign") == "-"
    try:
        # int() rejects absurdly long digit strings with ValueError.
        n = int(m.group("n"))
        # Build only the delta that was asked for: constructing one per unit
        # would let e.g. timedelta(weeks=n) overflow for a valid "N seconds".
        if compact:
            keyword = {"d": "days", "h": "hours", "m": "minutes"}[unit]
            delta: timedelta | relativedelta = timedelta(**{keyword: n})
        elif unit in ("month", "year"):
            delta = relativedelta(**{f"{unit}s": n})
        else:
            delta = timedelta(**{f"{unit}s": n})
        return now - delta if subtract else now + delta
    except (OverflowError, ValueError):
        return None
|
||||
|
||||
|
||||
def _bound_datetimes(
    field: str,
    token: str,
    tz: tzinfo,
) -> tuple[datetime, datetime] | None:
    """
    Return (floor_dt, ceil_dt) UTC datetimes for a single range bound token, or
    None if the token is unparsable.

    ``now``, relative offsets and full ISO datetimes resolve to one exact
    instant (floor == ceil; no day-flooring). Partial dates are expanded by
    ``_precision_bounds`` and converted by ``_utc_bounds_for_field``.
    """
    token = token.strip()

    # Try relative/now forms first (before stripping hyphens which would mangle them).
    rel = _resolve_relative_bound(token)
    if rel is not None:
        return rel, rel

    # Full ISO datetime token (contains "T"): parse directly and return an exact
    # instant. Python 3.11+ datetime.fromisoformat accepts trailing Z.
    if "T" in token:
        try:
            dt = datetime.fromisoformat(token)
            # A naive value is taken as UTC; an aware one is converted to UTC.
            # The conversion raises OverflowError when the offset pushes the
            # instant outside the supported year range.
            dt = dt.replace(tzinfo=UTC) if dt.tzinfo is None else dt.astimezone(UTC)
        except (ValueError, OverflowError):
            return None
        return dt, dt

    bounds = _precision_bounds(token.replace("-", ""))
    if bounds is None:
        return None
    start, end = bounds
    return _utc_bounds_for_field(field, start, end, tz)
|
||||
|
||||
|
||||
def _render(tok: Token, tz: tzinfo) -> str:
|
||||
"""Render a single token back to a Tantivy query string fragment."""
|
||||
if isinstance(tok, Passthrough):
|
||||
return tok.raw
|
||||
if isinstance(tok, Comma):
|
||||
return " AND "
|
||||
if isinstance(tok, FieldValueList):
|
||||
field = FIELD_ALIASES.get(tok.field, tok.field)
|
||||
return " AND ".join(f"{field}:{v}" for v in tok.values)
|
||||
if isinstance(tok, FieldValue):
|
||||
field = FIELD_ALIASES.get(tok.field, tok.field)
|
||||
if field in DATE_FIELDS:
|
||||
return translate_scalar(field, tok.value, tz)
|
||||
return f"{field}:{tok.value}"
|
||||
if isinstance(tok, FieldRange):
|
||||
field = FIELD_ALIASES.get(tok.field, tok.field)
|
||||
if field in DATE_FIELDS:
|
||||
return translate_range(field, tok.lo, tok.hi, tz)
|
||||
return f"{field}:{tok.open}{tok.lo} TO {tok.hi}{tok.close}"
|
||||
return "" # pragma: no cover
|
||||
|
||||
|
||||
# Post-render operator normalization patterns: collapse repeated whitespace and
# strip spaced/trailing Tantivy boolean operators that would otherwise be invalid.
# Two or more consecutive spaces.
_MULTI_SPACE_RE = regex.compile(r" {2,}")
# Whitespace followed by one or more "+"/"-" at the very end of the text.
_TRAILING_OP_RE = regex.compile(r"\s+[-+]+\s*$")
# A single "+" or "-" with whitespace on both sides.
_SPACED_OP_RE = regex.compile(r"\s+[-+]\s+")
|
||||
|
||||
|
||||
def _normalize_operators(text: str) -> str:
    """
    Clean up whitespace and dangling ``+``/``-`` operators in ``text``.

    Three substitutions are applied to the whole string, in order: runs of
    two or more spaces collapse to one; a trailing run of ``+``/``-``
    preceded by whitespace is removed; a lone ``+`` or ``-`` with whitespace
    on both sides is replaced by a single space. The result is stripped of
    surrounding whitespace.
    """
    collapsed = _MULTI_SPACE_RE.sub(" ", text)
    without_trailing = _TRAILING_OP_RE.sub("", collapsed).strip()
    return _SPACED_OP_RE.sub(" ", without_trailing).strip()
|
||||
|
||||
|
||||
def translate_query(raw: str, tz: tzinfo) -> str:
    """
    Convert a raw Whoosh-style query into Tantivy-compatible syntax.

    The query is scanned into tokens, value-list commas are resolved, each
    token is rendered, and the joined result has its operators normalized.
    """
    fragments = [_render(token, tz) for token in resolve_commas(scan(raw))]
    return _normalize_operators("".join(fragments))
|
||||
|
||||
|
||||
def translate_range(field: str, lo: str, hi: str, tz: tzinfo) -> str:
    """Build the Tantivy ISO range string for a date-field ``[lo TO hi]``.

    Each non-empty bound is parsed by ``_bound_datetimes``; an empty bound is
    open and rendered as ``OPEN_LO`` / ``OPEN_HI``. When both bounds are given
    in reverse order they are swapped before the floor of the lower bound and
    the ceiling of the upper bound are formatted.

    Raises ``InvalidDateQuery`` for a bound that cannot be parsed.
    """

    def parse(bound: str) -> tuple[datetime, datetime] | None:
        # Empty text means an open bound; anything else must parse.
        text = bound.strip()
        if not text:
            return None
        pair = _bound_datetimes(field, text, tz)
        if pair is None:
            raise InvalidDateQuery(field, text)
        return pair

    lower = parse(lo)
    upper = parse(hi)

    # Only a range with BOTH bounds present can be reversed.
    if lower is not None and upper is not None and lower[0] > upper[0]:
        lower, upper = upper, lower

    start = OPEN_LO if lower is None else _fmt(lower[0])
    stop = OPEN_HI if upper is None else _fmt(upper[1])
    return f"{field}:[{start} TO {stop}]"
|
||||
@@ -1,11 +1,15 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import tempfile
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import pytest
|
||||
import tantivy
|
||||
|
||||
from documents.search._backend import TantivyBackend
|
||||
from documents.search._backend import reset_backend
|
||||
from documents.search._schema import build_schema
|
||||
from documents.search._tokenizer import register_tokenizers
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Generator
|
||||
@@ -31,3 +35,11 @@ def backend() -> Generator[TantivyBackend, None, None]:
|
||||
finally:
|
||||
b.close()
|
||||
reset_backend()
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def index() -> Generator[tantivy.Index, None, None]:
    """
    A real Tantivy index for parse-acceptance tests (module scope for speed).

    The index lives in a temporary directory that is removed once the module's
    tests have finished, so test runs do not leave index directories behind.
    """
    # ignore_cleanup_errors: the index may still hold files open at teardown
    # on some platforms; a failed removal must not fail the test run.
    with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as index_dir:
        idx = tantivy.Index(build_schema(), path=index_dir)
        register_tokenizers(idx, "english")
        yield idx
|
||||
|
||||
@@ -13,7 +13,6 @@ import time_machine
|
||||
|
||||
from documents.search._query import _date_only_range
|
||||
from documents.search._query import _datetime_range
|
||||
from documents.search._query import _rewrite_compact_date
|
||||
from documents.search._query import build_permission_filter
|
||||
from documents.search._query import normalize_query
|
||||
from documents.search._query import parse_simple_text_highlight_query
|
||||
@@ -21,6 +20,7 @@ from documents.search._query import parse_user_query
|
||||
from documents.search._query import rewrite_natural_date_keywords
|
||||
from documents.search._schema import build_schema
|
||||
from documents.search._tokenizer import register_tokenizers
|
||||
from documents.search._translate import InvalidDateQuery
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from django.contrib.auth.base_user import AbstractBaseUser
|
||||
@@ -405,12 +405,14 @@ class TestWhooshQueryRewriting:
|
||||
assert lo == "2023-12-01T05:00:00Z"
|
||||
assert hi == "2023-12-02T05:00:00Z"
|
||||
|
||||
def test_8digit_invalid_date_passes_through_unchanged(self) -> None:
|
||||
assert rewrite_natural_date_keywords("added:20231340", UTC) == "added:20231340"
|
||||
|
||||
def test_compact_14digit_invalid_date_passes_through_unchanged(self) -> None:
|
||||
# Month=13 makes datetime() raise ValueError; the token must be left as-is
|
||||
assert _rewrite_compact_date("20231300120000") == "20231300120000"
|
||||
def test_8digit_invalid_date_raises(self) -> None:
    """An unparsable 8-digit date (month 13) raises instead of passing through."""
    # The translation pipeline raises InvalidDateQuery so the API can surface
    # a 400 telling the user the date is malformed instead of silently
    # returning zero results.
    bad_query = "added:20231340"
    with pytest.raises(InvalidDateQuery) as exc_info:
        rewrite_natural_date_keywords(bad_query, UTC)
    error = exc_info.value
    assert error.field == "added"
    assert error.value == "20231340"
|
||||
|
||||
|
||||
class TestParseUserQuery:
|
||||
@@ -463,6 +465,67 @@ class TestParseUserQuery:
|
||||
) -> None:
|
||||
assert isinstance(parse_user_query(query_index, raw_query, UTC), tantivy.Query)
|
||||
|
||||
@pytest.mark.parametrize(
    "raw_query",
    [
        # Partial date scalar (year only)
        pytest.param("created:2020", id="created_year_scalar"),
        # 8-digit compact date range in brackets
        pytest.param(
            "created:[20200101 TO 20201231]",
            id="created_8digit_bracket_range",
        ),
        # Comma-separated field + date range (Whoosh v2 multi-clause syntax)
        pytest.param(
            "title:x,created:[2020 TO 2021]",
            id="title_comma_created_range",
        ),
        # Field alias: type -> document_type
        pytest.param("type:invoice", id="type_alias"),
        # Multi-word date keyword
        pytest.param("created:previous week", id="created_previous_week"),
        # Full ISO datetime range
        pytest.param(
            "created:[2026-01-01T00:00:00Z TO 2026-06-01T00:00:00Z]",
            id="created_iso_range",
        ),
        # Comma-separated ISO ranges (Whoosh v2 syntax)
        pytest.param(
            "created:[2026-01-01T00:00:00Z TO 2026-06-01T00:00:00Z],"
            "added:[2026-05-01T00:00:00Z TO 2026-06-01T00:00:00Z]",
            id="comma_iso_ranges",
        ),
    ],
)
def test_advanced_search_queries_do_not_raise(
    self,
    query_index: tantivy.Index,
    raw_query: str,
) -> None:
    """
    End-to-end: every query form the frontend sends must parse cleanly.

    Exercises the full pipeline (translate_query -> tantivy parse_query);
    equivalent to asserting HTTP 200 (not 400) for each query form.
    """
    # Freeze the clock so keyword and relative dates resolve deterministically.
    frozen_now = datetime(2026, 6, 15, 12, 0, tzinfo=UTC)
    with time_machine.travel(frozen_now, tick=False):
        parsed = parse_user_query(query_index, raw_query, UTC)
        assert isinstance(parsed, tantivy.Query)
|
||||
|
||||
def test_invalid_date_propagates_not_swallowed(
    self,
    query_index: tantivy.Index,
) -> None:
    """InvalidDateQuery must escape parse_user_query rather than be swallowed."""
    # parse_user_query falls back to the raw query on unexpected translation
    # errors, but an InvalidDateQuery is intentional and must propagate so the
    # view can return a 400 instead of silently parsing the raw (invalid) date.
    bad_query = "created:202023"
    with pytest.raises(InvalidDateQuery) as exc_info:
        parse_user_query(query_index, bad_query, UTC)
    error = exc_info.value
    assert error.field == "created"
    assert error.value == "202023"
|
||||
|
||||
|
||||
class TestYearRangeRewriting:
|
||||
"""Whoosh-style year-only date ranges must be rewritten to ISO 8601."""
|
||||
@@ -542,11 +605,16 @@ class TestYearRangeRewriting:
|
||||
assert rewrite_natural_date_keywords(original, UTC) == original
|
||||
|
||||
def test_8digit_in_brackets_not_matched_as_year_range(self) -> None:
    """8-digit range bounds are converted to an ISO day range."""
    # [YYYYMMDD TO YYYYMMDD]: the translation layer converts 8-digit bounds to
    # ISO day ranges. 20200101 -> 2020-01-01T00:00:00Z (lo of that day);
    # 20201231 -> the ceil of Dec 31 = 2021-01-01T00:00:00Z (exclusive end).
    # This is the correct and accepted behavior: old compact form becomes a
    # proper Tantivy-parseable ISO range.
    original = "created:[20200101 TO 20201231]"
    result = rewrite_natural_date_keywords(original, UTC)
    lo, hi = _range(result, "created")
    assert lo == "2020-01-01T00:00:00Z"
    assert hi == "2021-01-01T00:00:00Z"
|
||||
|
||||
|
||||
class TestNonDateFieldsNotRewritten:
|
||||
@@ -606,6 +674,16 @@ class TestNormalizeQuery:
|
||||
def test_normalize_expands_comma_separated_tags(self) -> None:
    """A two-value tag list becomes two AND-joined tag clauses."""
    normalized = normalize_query("tag:foo,bar")
    assert normalized == "tag:foo AND tag:bar"
|
||||
|
||||
def test_normalize_comma_between_range_expressions(self) -> None:
    """A comma between two field ranges is converted to AND."""
    # Comma-separated field range expressions (Whoosh v2 syntax) must be
    # converted to AND so Tantivy does not receive an invalid comma.
    created = "created:[2026-01-01T00:00:00Z TO 2026-06-01T00:00:00Z]"
    added = "added:[2026-05-01T00:00:00Z TO 2026-06-01T00:00:00Z]"
    assert normalize_query(f"{created},{added}") == f"{created} AND {added}"
|
||||
|
||||
def test_normalize_expands_three_values(self) -> None:
    """A three-value tag list becomes three AND-joined tag clauses."""
    normalized = normalize_query("tag:foo,bar,baz")
    assert normalized == "tag:foo AND tag:bar AND tag:baz"
|
||||
|
||||
|
||||
@@ -0,0 +1,742 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import UTC
|
||||
from datetime import datetime
|
||||
from typing import TYPE_CHECKING
|
||||
from zoneinfo import ZoneInfo
|
||||
|
||||
import pytest
|
||||
import time_machine
|
||||
|
||||
from documents.search._dates import _precision_bounds
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import tantivy
|
||||
from documents.search._query import _FIELD_BOOSTS
|
||||
from documents.search._query import DEFAULT_SEARCH_FIELDS
|
||||
from documents.search._translate import OPEN_HI
|
||||
from documents.search._translate import OPEN_LO
|
||||
from documents.search._translate import Comma
|
||||
from documents.search._translate import FieldRange
|
||||
from documents.search._translate import FieldValue
|
||||
from documents.search._translate import FieldValueList
|
||||
from documents.search._translate import InvalidDateQuery
|
||||
from documents.search._translate import Passthrough
|
||||
from documents.search._translate import resolve_commas
|
||||
from documents.search._translate import scan
|
||||
from documents.search._translate import translate_query
|
||||
from documents.search._translate import translate_range
|
||||
from documents.search._translate import translate_scalar
|
||||
|
||||
|
||||
@pytest.mark.search
class TestPrecisionBounds:
    """Checks the (start, end) dates ``_precision_bounds`` returns for digit strings."""

    @pytest.mark.parametrize(
        ("digits", "expected"),
        [
            ("2020", ((2020, 1, 1), (2021, 1, 1))),
            ("202003", ((2020, 3, 1), (2020, 4, 1))),
            ("202012", ((2020, 12, 1), (2021, 1, 1))),
            ("20200115", ((2020, 1, 15), (2020, 1, 16))),
            ("20201231", ((2020, 12, 31), (2021, 1, 1))),
        ],
    )
    def test_valid(self, digits, expected):
        want_start, want_end = expected
        start, end = _precision_bounds(digits)
        assert (start.year, start.month, start.day) == want_start
        assert (end.year, end.month, end.day) == want_end

    @pytest.mark.parametrize("digits", ["202023", "20200230", "20201301", "20", "abcd"])
    def test_invalid_returns_none(self, digits):
        result = _precision_bounds(digits)
        assert result is None
|
||||
|
||||
|
||||
@pytest.mark.search
class TestScan:
    """Token streams produced by ``scan`` for well-formed and malformed queries."""

    def test_plain_words_are_passthrough(self):
        tokens = scan("bank statement")
        assert tokens == [Passthrough("bank statement")]

    def test_field_value(self):
        tokens = scan("created:2020")
        assert tokens == [FieldValue("created", "2020")]

    def test_field_value_in_boolean(self):
        expected = [
            FieldValue("created", "2020"),
            Passthrough(" OR foo"),
        ]
        assert scan("created:2020 OR foo") == expected

    def test_field_value_in_parens(self):
        expected = [
            Passthrough("("),
            FieldValue("created", "2020"),
            Passthrough(" OR foo)"),
        ]
        assert scan("(created:2020 OR foo)") == expected

    def test_quoted_value(self):
        tokens = scan('correspondent:"A B"')
        assert tokens == [FieldValue("correspondent", '"A B"')]

    def test_field_range(self):
        expected = [FieldRange("created", "[", "2020", "2021", "]")]
        assert scan("created:[2020 TO 2021]") == expected

    @pytest.mark.parametrize(
        ("query", "expected"),
        [
            pytest.param(
                "created:[2020 to]",
                FieldRange("created", "[", "2020", "", "]"),
                id="open_upper",
            ),
            pytest.param(
                "created:[to 2020]",
                FieldRange("created", "[", "", "2020", "]"),
                id="open_lower",
            ),
        ],
    )
    def test_open_range(self, query, expected):
        tokens = scan(query)
        assert tokens == [expected]

    def test_comma_inside_range_not_split(self):
        # No depth-0 comma here; the whole thing is one range token.
        tokens = scan("created:[2020 TO 2021]")
        assert len(tokens) == 1

    # --- Edge-case / regression tests (scan must never raise) ---

    def test_url_is_passthrough(self):
        # "http" is not a known field; the whole URL must pass through verbatim.
        url = "http://example.com"
        assert scan(url) == [Passthrough(url)]

    def test_unterminated_quote_is_passthrough(self):
        # title is a known field but the quoted value has no closing quote;
        # _consume_value returns None so the whole string falls into passthrough.
        query = 'title:"abc'
        assert scan(query) == [Passthrough(query)]

    def test_unterminated_bracket_is_passthrough(self):
        # created is a known field but the range bracket is never closed;
        # _consume_range returns None so the whole string falls into passthrough.
        query = "created:[2020"
        assert scan(query) == [Passthrough(query)]

    def test_empty_value_at_end_is_passthrough(self):
        # created is a known field but there is no value after the colon
        # (_consume_value returns None for start >= n), so passthrough.
        query = "created:"
        assert scan(query) == [Passthrough(query)]

    def test_value_containing_colon(self):
        # The bare-word value reader stops at whitespace/paren, not at colon,
        # so "2020:30" is consumed as a single value token.
        tokens = scan("created:2020:30")
        assert tokens == [FieldValue("created", "2020:30")]

    def test_comma_followed_by_unconsumable_value_stops(self):
        # A comma followed by whitespace is neither a value-list continuation nor a
        # clause separator: the value stops and the comma stays as passthrough.
        expected = [
            FieldValue("tag", "foo"),
            Passthrough(", bar"),
        ]
        assert scan("tag:foo, bar") == expected

    def test_bracket_without_to_is_open_upper_bound(self):
        # A bracketed value with no TO falls back to (value, "") -> open upper bound.
        expected = [FieldRange("created", "[", "2020", "", "]")]
        assert scan("created:[2020]") == expected

    def test_known_field_name_midword_is_passthrough(self):
        # A known field name embedded mid-word is not a field token (the
        # word-boundary guard); the whole run stays passthrough.
        query = "xtag:foo"
        assert scan(query) == [Passthrough(query)]
|
||||
|
||||
|
||||
@pytest.mark.search
class TestCommaResolution:
    """How ``resolve_commas(scan(...))`` treats value-list and clause-separator commas."""

    def test_value_list_multi_value_field(self):
        resolved = resolve_commas(scan("tag:foo,bar"))
        assert resolved == [FieldValueList("tag", ("foo", "bar"))]

    def test_value_list_three(self):
        resolved = resolve_commas(scan("tag_id:1,2,3"))
        assert resolved == [FieldValueList("tag_id", ("1", "2", "3"))]

    def test_text_field_comma_is_literal(self):
        # correspondent is not multi-value: comma stays inside the value.
        resolved = resolve_commas(scan("correspondent:foo,bar"))
        assert resolved == [FieldValue("correspondent", "foo,bar")]

    def test_clause_separator_before_known_field(self):
        expected = [FieldValue("tag", "foo"), Comma(), FieldValue("type", "bar")]
        assert resolve_commas(scan("tag:foo,type:bar")) == expected

    def test_clause_separator_after_range(self):
        query = "created:[2020 TO 2021],added:[2022 TO 2023]"
        expected = [
            FieldRange("created", "[", "2020", "2021", "]"),
            Comma(),
            FieldRange("added", "[", "2022", "2023", "]"),
        ]
        assert resolve_commas(scan(query)) == expected

    def test_clause_separator_after_quote(self):
        query = 'correspondent:"A B",created:[2020 TO 2021]'
        expected = [
            FieldValue("correspondent", '"A B"'),
            Comma(),
            FieldRange("created", "[", "2020", "2021", "]"),
        ]
        assert resolve_commas(scan(query)) == expected

    def test_url_comma_is_literal_passthrough(self):
        url = "http://example.com/a,b"
        assert resolve_commas(scan(url)) == [Passthrough(url)]

    def test_non_multi_value_comma_is_literal(self):
        # title is not in MULTI_VALUE_FIELDS: comma stays inside the value.
        resolved = resolve_commas(scan("title:10,20"))
        assert resolved == [FieldValue("title", "10,20")]

    def test_clause_separator_before_known_date_field(self):
        # The comma between a bare value and a known date field acts as a
        # clause separator; both sides survive as distinct tokens.
        query = "correspondent:foo,created:[2020 TO 2021]"
        expected = [
            FieldValue("correspondent", "foo"),
            Comma(),
            FieldRange("created", "[", "2020", "2021", "]"),
        ]
        assert resolve_commas(scan(query)) == expected
|
||||
|
||||
|
||||
@pytest.mark.search
class TestTranslateScalar:
    """Range strings and errors produced by ``translate_scalar`` for scalar date values."""

    @pytest.mark.parametrize(
        ("field", "value", "expected"),
        [
            (
                "created",
                "2020",
                "created:[2020-01-01T00:00:00Z TO 2021-01-01T00:00:00Z]",
            ),
            (
                "created",
                "202003",
                "created:[2020-03-01T00:00:00Z TO 2020-04-01T00:00:00Z]",
            ),
            (
                "created",
                "20200115",
                "created:[2020-01-15T00:00:00Z TO 2020-01-16T00:00:00Z]",
            ),
            (
                "created",
                "2020-01-15",
                "created:[2020-01-15T00:00:00Z TO 2020-01-16T00:00:00Z]",
            ),
            (
                "created",
                "2020-03",
                "created:[2020-03-01T00:00:00Z TO 2020-04-01T00:00:00Z]",
            ),
        ],
    )
    def test_partial_and_iso_dates(self, field: str, value: str, expected: str) -> None:
        translated = translate_scalar(field, value, UTC)
        assert translated == expected

    def test_invalid_date_raises(self) -> None:
        with pytest.raises(InvalidDateQuery) as exc_info:
            translate_scalar("created", "202023", UTC)
        error = exc_info.value
        assert error.field == "created"
        assert error.value == "202023"

    def test_keyword_delegates(self) -> None:
        # keyword path produces a range; just assert it is a created range
        translated = translate_scalar("created", "today", UTC)
        assert translated.startswith("created:[") and translated.endswith("]")

    def test_14digit_compact_datetime(self) -> None:
        compact = "20240115120000"
        translated = translate_scalar("created", compact, UTC)
        assert compact not in translated
        assert translated.startswith("created:")
        assert translated == "created:[2024-01-15T12:00:00Z TO 2024-01-15T12:00:00Z]"

    def test_14digit_invalid_month_raises(self) -> None:
        with pytest.raises(InvalidDateQuery) as exc_info:
            translate_scalar("created", "20231300120000", UTC)
        error = exc_info.value
        assert error.field == "created"
        assert error.value == "20231300120000"

    def test_unrecognized_value_raises(self) -> None:
        # A value that is not a keyword, digits, ISO date, or compact timestamp
        # raises rather than producing invalid Tantivy syntax or silently matching
        # nothing.
        with pytest.raises(InvalidDateQuery) as exc_info:
            translate_scalar("created", "garbage", UTC)
        error = exc_info.value
        assert error.field == "created"
        assert error.value == "garbage"
|
||||
|
||||
|
||||
@pytest.mark.search
class TestTranslateRange:
    """translate_range: a ``[lo TO hi]`` date range rendered with ISO bounds."""

    @pytest.mark.parametrize(
        ("lo", "hi", "expected"),
        [
            ("2005", "2009", "created:[2005-01-01T00:00:00Z TO 2010-01-01T00:00:00Z]"),
            (
                "202001",
                "202006",
                "created:[2020-01-01T00:00:00Z TO 2020-07-01T00:00:00Z]",
            ),
            (
                "20200101",
                "20201231",
                "created:[2020-01-01T00:00:00Z TO 2021-01-01T00:00:00Z]",
            ),
            (
                "2020-01-01",
                "2020-12-31",
                "created:[2020-01-01T00:00:00Z TO 2021-01-01T00:00:00Z]",
            ),
        ],
    )
    def test_absolute_ranges(self, lo: str, hi: str, expected: str) -> None:
        """Year, month, compact-day and ISO-day bounds produce the expected range."""
        assert translate_range("created", lo, hi, UTC) == expected

    def test_reversed_swaps(self) -> None:
        """Bounds given high-to-low render the same as the low-to-high order."""
        assert translate_range("created", "2009", "2005", UTC) == (
            "created:[2005-01-01T00:00:00Z TO 2010-01-01T00:00:00Z]"
        )

    def test_open_upper(self) -> None:
        """An empty upper bound is rendered as OPEN_HI."""
        out = translate_range("created", "2020", "", UTC)
        assert out == f"created:[2020-01-01T00:00:00Z TO {OPEN_HI}]"

    def test_open_lower(self) -> None:
        """An empty lower bound is rendered as OPEN_LO."""
        out = translate_range("created", "", "2020", UTC)
        assert out == f"created:[{OPEN_LO} TO 2021-01-01T00:00:00Z]"

    def test_invalid_bound_raises(self) -> None:
        """An unparsable low bound raises, reporting the field and that bound."""
        with pytest.raises(InvalidDateQuery) as exc_info:
            translate_range("created", "202023", "2025", UTC)
        assert exc_info.value.field == "created"
        assert exc_info.value.value == "202023"

    def test_invalid_high_bound_raises(self) -> None:
        """Low bound parses, high bound does not: the error names the high bound."""
        with pytest.raises(InvalidDateQuery) as exc_info:
            translate_range("created", "2020", "garbage", UTC)
        assert exc_info.value.field == "created"
        assert exc_info.value.value == "garbage"
|
||||
|
||||
|
||||
@pytest.mark.search
class TestTranslateQuery:
    """End-to-end translate_query checks: golden strings and parser acceptance."""

    @pytest.mark.parametrize(
        ("raw", "expected"),
        [
            (
                "created:2020",
                "created:[2020-01-01T00:00:00Z TO 2021-01-01T00:00:00Z]",
            ),
            ("tag:foo,bar", "tag:foo AND tag:bar"),
            # The user-facing 'type' alias is rewritten to the real schema
            # field, 'document_type'.
            ("tag:foo,type:bar", "tag:foo AND document_type:bar"),
            (
                "created:[2020 TO 2021],added:[2022 TO 2023]",
                "created:[2020-01-01T00:00:00Z TO 2022-01-01T00:00:00Z]"
                " AND "
                "added:[2022-01-01T00:00:00Z TO 2024-01-01T00:00:00Z]",
            ),
            # correspondent is not multi-value, so the comma remains a literal
            # part of the value.
            ("correspondent:foo,bar", "correspondent:foo,bar"),
        ],
    )
    def test_golden(self, raw: str, expected: str) -> None:
        """Each raw query translates to exactly the expected string."""
        translated = translate_query(raw, UTC)
        assert translated == expected

    @pytest.mark.parametrize(
        "raw",
        [
            "created:2020",
            "created:202003",
            "created:[20200101 TO 20201231]",
            "created:[2020-01-01 TO 2020-12-31]",
            "created:[2020 to]",
            "created:[to 2020]",
            "title:x,created:[2020 TO 2021]",
            "created:2020 OR foo",
            "(created:2020 OR invoice)",
            "tag:foo,type:bar",
            "bank statement",
        ],
    )
    def test_parse_acceptance(self, index: tantivy.Index, raw: str) -> None:
        """The translated query is accepted by the index parser without raising."""
        query_text = translate_query(raw, UTC)
        # The call itself is the assertion: it must not raise.
        index.parse_query(query_text, DEFAULT_SEARCH_FIELDS, field_boosts=_FIELD_BOOSTS)
|
||||
|
||||
|
||||
@pytest.mark.search
class TestFieldAliasing:
    """Whoosh->Tantivy field-name aliasing (type/path -> document_type/storage_path)."""

    def test_type_alias(self) -> None:
        """'type' is rewritten to 'document_type'."""
        translated = translate_query("type:invoice", UTC)
        assert translated == "document_type:invoice"

    def test_path_alias(self) -> None:
        """'path' is rewritten to 'storage_path'."""
        translated = translate_query("path:/foo/bar", UTC)
        assert translated == "storage_path:/foo/bar"

    def test_type_id_alias(self) -> None:
        """'type_id' is rewritten to 'document_type_id'."""
        translated = translate_query("type_id:5", UTC)
        assert translated == "document_type_id:5"

    def test_path_id_alias(self) -> None:
        """'path_id' is rewritten to 'storage_path_id'."""
        translated = translate_query("path_id:7", UTC)
        assert translated == "storage_path_id:7"

    def test_clause_separator_plus_alias(self) -> None:
        """A comma between known fields becomes AND, and the alias still applies."""
        translated = translate_query("tag:foo,type:bar", UTC)
        assert translated == "tag:foo AND document_type:bar"

    def test_type_range_alias(self) -> None:
        """type is not a date field: the range is kept verbatim, only aliased."""
        translated = translate_query("type:[2020 TO 2021]", UTC)
        assert translated == "document_type:[2020 TO 2021]"

    def test_parse_acceptance_type(self, index: tantivy.Index) -> None:
        """The aliased 'type' query is accepted by the real Tantivy parser."""
        query_text = translate_query("type:invoice", UTC)
        index.parse_query(query_text, DEFAULT_SEARCH_FIELDS, field_boosts=_FIELD_BOOSTS)

    def test_parse_acceptance_path(self, index: tantivy.Index) -> None:
        """The aliased 'path' query is accepted by the real Tantivy parser."""
        query_text = translate_query("path:foo", UTC)
        index.parse_query(query_text, DEFAULT_SEARCH_FIELDS, field_boosts=_FIELD_BOOSTS)
|
||||
|
||||
|
||||
# Fixed reference instant (2026-03-28 12:00 UTC) that the relative-date tests
# travel to via time_machine, so their expected ranges are deterministic.
|
||||
_FROZEN_NOW = datetime(2026, 3, 28, 12, 0, 0, tzinfo=UTC)
|
||||
|
||||
|
||||
@pytest.mark.search
class TestRelativeRanges:
    """Relative date-range tokens resolved against a frozen clock."""

    @time_machine.travel(_FROZEN_NOW, tick=False)
    def test_minus_7_days_to_now(self) -> None:
        """'-7 days to now' spans the seven days up to the frozen instant."""
        translated = translate_query("added:[-7 days to now]", UTC)
        assert translated == "added:[2026-03-21T12:00:00Z TO 2026-03-28T12:00:00Z]"

    @time_machine.travel(_FROZEN_NOW, tick=False)
    def test_minus_1_week_to_now(self) -> None:
        """'-1 week to now' yields the same span as seven days."""
        translated = translate_query("added:[-1 week to now]", UTC)
        assert translated == "added:[2026-03-21T12:00:00Z TO 2026-03-28T12:00:00Z]"

    @time_machine.travel(_FROZEN_NOW, tick=False)
    def test_minus_1_month_to_now(self) -> None:
        """'-1 month to now' starts one calendar month before the frozen instant."""
        translated = translate_query("created:[-1 month to now]", UTC)
        assert translated == "created:[2026-02-28T12:00:00Z TO 2026-03-28T12:00:00Z]"

    @time_machine.travel(_FROZEN_NOW, tick=False)
    def test_minus_1_year_to_now(self) -> None:
        """'-1 year to now' starts one calendar year before the frozen instant."""
        translated = translate_query("modified:[-1 year to now]", UTC)
        assert translated == "modified:[2025-03-28T12:00:00Z TO 2026-03-28T12:00:00Z]"

    @time_machine.travel(_FROZEN_NOW, tick=False)
    def test_minus_3_hours_to_now(self) -> None:
        """Hour offsets are resolved at hour precision."""
        translated = translate_query("added:[-3 hours to now]", UTC)
        assert translated == "added:[2026-03-28T09:00:00Z TO 2026-03-28T12:00:00Z]"

    @time_machine.travel(_FROZEN_NOW, tick=False)
    def test_uppercase_units(self) -> None:
        """Upper-case units and keywords are accepted."""
        translated = translate_query("added:[-1 WEEK TO NOW]", UTC)
        assert translated == "added:[2026-03-21T12:00:00Z TO 2026-03-28T12:00:00Z]"

    @time_machine.travel(_FROZEN_NOW, tick=False)
    def test_now_minus_7d_compact(self) -> None:
        """The compact 'now-7d' form resolves like '-7 days'."""
        translated = translate_query("added:[now-7d TO now]", UTC)
        assert translated == "added:[2026-03-21T12:00:00Z TO 2026-03-28T12:00:00Z]"

    @time_machine.travel(_FROZEN_NOW, tick=False)
    def test_reversed_range_swapped(self) -> None:
        """A reversed range (now+1h TO now-1h) is swapped: lo=now-1h, hi=now+1h."""
        translated = translate_query("added:[now+1h TO now-1h]", UTC)
        assert translated == "added:[2026-03-28T11:00:00Z TO 2026-03-28T13:00:00Z]"

    @pytest.mark.parametrize(
        "raw",
        [
            "added:[-7 days to now]",
            "added:[-1 week to now]",
            "created:[-1 month to now]",
            "modified:[-1 year to now]",
            "added:[-3 hours to now]",
            "added:[now-7d TO now]",
            "added:[now+1h TO now-1h]",
        ],
    )
    @time_machine.travel(_FROZEN_NOW, tick=False)
    def test_parse_acceptance(self, index: tantivy.Index, raw: str) -> None:
        """Every translated relative range is accepted by the index parser."""
        query_text = translate_query(raw, UTC)
        index.parse_query(query_text, DEFAULT_SEARCH_FIELDS, field_boosts=_FIELD_BOOSTS)
|
||||
|
||||
|
||||
@pytest.mark.search
class TestOperatorNormalization:
    """Post-render operator normalization in translate_query."""

    def test_spaced_dash_removed(self) -> None:
        """A dash surrounded by spaces is dropped from the query."""
        translated = translate_query("H52.1 - Kurzsichtigkeit", UTC)
        assert translated == "H52.1 Kurzsichtigkeit"

    def test_spaced_dash_simple(self) -> None:
        """Same as above with plain words."""
        translated = translate_query("bar - baz", UTC)
        assert translated == "bar baz"

    def test_trailing_operator_stripped(self) -> None:
        """A dangling trailing operator is removed."""
        translated = translate_query("foo -", UTC)
        assert translated == "foo"

    def test_date_range_preserved(self) -> None:
        """Normalization must not corrupt the dashes inside an ISO range."""
        translated = translate_query("created:[2020 TO 2021]", UTC)
        assert translated == "created:[2020-01-01T00:00:00Z TO 2022-01-01T00:00:00Z]"

    def test_date_scalar_with_or(self) -> None:
        """The created scalar becomes a range; ' OR foo' passes through verbatim."""
        translated = translate_query("created:2020 OR foo", UTC)
        assert translated.startswith("created:[")
        assert "OR foo" in translated

    def test_parse_acceptance_spaced_dash(self, index: tantivy.Index) -> None:
        """The normalized spaced-dash query is accepted by the index parser."""
        query_text = translate_query("H52.1 - Kurzsichtigkeit", UTC)
        index.parse_query(query_text, DEFAULT_SEARCH_FIELDS, field_boosts=_FIELD_BOOSTS)

    def test_parse_acceptance_trailing_op(self, index: tantivy.Index) -> None:
        """The normalized trailing-operator query is accepted by the index parser."""
        query_text = translate_query("foo -", UTC)
        index.parse_query(query_text, DEFAULT_SEARCH_FIELDS, field_boosts=_FIELD_BOOSTS)
|
||||
|
||||
|
||||
@pytest.mark.search
class TestMultiWordDateKeywords:
    """scan() must consume multi-word date keywords as a single value."""

    def test_scan_previous_week_as_single_token(self) -> None:
        """'created:previous week' is one FieldValue, not a value plus ' week'."""
        expected = [FieldValue("created", "previous week")]
        assert scan("created:previous week") == expected

    def test_scan_this_month_as_single_token(self) -> None:
        """'this month' is kept together on a date field."""
        expected = [FieldValue("added", "this month")]
        assert scan("added:this month") == expected

    def test_scan_previous_month_as_single_token(self) -> None:
        """'previous month' is kept together on a date field."""
        expected = [FieldValue("created", "previous month")]
        assert scan("created:previous month") == expected

    def test_scan_this_year_as_single_token(self) -> None:
        """'this year' is kept together on a date field."""
        expected = [FieldValue("added", "this year")]
        assert scan("added:this year") == expected

    def test_scan_previous_year_as_single_token(self) -> None:
        """'previous year' is kept together on a date field."""
        expected = [FieldValue("created", "previous year")]
        assert scan("created:previous year") == expected

    def test_scan_previous_quarter_as_single_token(self) -> None:
        """'previous quarter' is kept together on a date field."""
        expected = [FieldValue("created", "previous quarter")]
        assert scan("created:previous quarter") == expected

    def test_quoted_multi_word_keyword_still_works(self) -> None:
        """The quoted form keeps working; the quotes stay part of the value."""
        expected = [FieldValue("created", '"previous week"')]
        assert scan('created:"previous week"') == expected

    def test_non_date_field_not_affected(self) -> None:
        """On a non-date field the value stops at the space; ' week' passes through."""
        expected = [
            FieldValue("correspondent", "previous"),
            Passthrough(" week"),
        ]
        assert scan("correspondent:previous week") == expected
|
||||
|
||||
|
||||
@pytest.mark.search
class TestKeywordDateResolution:
    """Relative date keywords resolve to exact ISO ranges against a frozen clock.

    The clock is pinned to 2026-03-28 12:00 UTC (a Saturday in Q1), so one
    anchor exercises the week, month, quarter and year rollovers.
    """

    # created is a DateField: bounds are UTC midnight, no timezone offset.
    @pytest.mark.parametrize(
        ("keyword", "expected"),
        [
            # Each test id is the keyword with its space replaced by a hyphen.
            pytest.param(kw, rendered, id=kw.replace(" ", "-"))
            for kw, rendered in (
                ("today", "created:[2026-03-28T00:00:00Z TO 2026-03-29T00:00:00Z]"),
                (
                    "yesterday",
                    "created:[2026-03-27T00:00:00Z TO 2026-03-28T00:00:00Z]",
                ),
                (
                    "previous week",
                    "created:[2026-03-16T00:00:00Z TO 2026-03-23T00:00:00Z]",
                ),
                (
                    "this month",
                    "created:[2026-03-01T00:00:00Z TO 2026-04-01T00:00:00Z]",
                ),
                (
                    "previous month",
                    "created:[2026-02-01T00:00:00Z TO 2026-03-01T00:00:00Z]",
                ),
                (
                    "this year",
                    "created:[2026-01-01T00:00:00Z TO 2027-01-01T00:00:00Z]",
                ),
                (
                    "previous year",
                    "created:[2025-01-01T00:00:00Z TO 2026-01-01T00:00:00Z]",
                ),
                (
                    "previous quarter",
                    "created:[2025-10-01T00:00:00Z TO 2026-01-01T00:00:00Z]",
                ),
            )
        ],
    )
    @time_machine.travel(_FROZEN_NOW, tick=False)
    def test_date_only_field_keyword_ranges(
        self,
        keyword: str,
        expected: str,
    ) -> None:
        """Each keyword on the date-only 'created' field gives the expected range."""
        translated = translate_query(f"created:{keyword}", UTC)
        assert translated == expected

    # added is a DateTimeField: local-tz midnight converted to UTC. Tokyo
    # (+09:00, no DST) moves every midnight boundary back to 15:00Z on the
    # previous day, which also exercises the local-midnight offset path.
    @pytest.mark.parametrize(
        ("keyword", "expected"),
        [
            # Each test id is the keyword with its space replaced by a hyphen.
            pytest.param(kw, rendered, id=kw.replace(" ", "-"))
            for kw, rendered in (
                ("today", "added:[2026-03-27T15:00:00Z TO 2026-03-28T15:00:00Z]"),
                ("yesterday", "added:[2026-03-26T15:00:00Z TO 2026-03-27T15:00:00Z]"),
                (
                    "previous week",
                    "added:[2026-03-15T15:00:00Z TO 2026-03-22T15:00:00Z]",
                ),
                ("this month", "added:[2026-02-28T15:00:00Z TO 2026-03-31T15:00:00Z]"),
                (
                    "previous month",
                    "added:[2026-01-31T15:00:00Z TO 2026-02-28T15:00:00Z]",
                ),
                ("this year", "added:[2025-12-31T15:00:00Z TO 2026-12-31T15:00:00Z]"),
                (
                    "previous year",
                    "added:[2024-12-31T15:00:00Z TO 2025-12-31T15:00:00Z]",
                ),
                (
                    "previous quarter",
                    "added:[2025-09-30T15:00:00Z TO 2025-12-31T15:00:00Z]",
                ),
            )
        ],
    )
    @time_machine.travel(_FROZEN_NOW, tick=False)
    def test_datetime_field_keyword_ranges_local_tz(
        self,
        keyword: str,
        expected: str,
    ) -> None:
        """Each keyword on 'added' is resolved at Asia/Tokyo local midnight."""
        tokyo = ZoneInfo("Asia/Tokyo")
        translated = translate_query(f"added:{keyword}", tokyo)
        assert translated == expected
|
||||
|
||||
|
||||
@pytest.mark.search
class TestISODatetimeBounds:
    """Full ISO datetime tokens in range bounds must be parsed directly."""

    def test_translate_range_iso_bounds_passthrough(self) -> None:
        """Bounds that are already ISO datetimes come out as the same instants."""
        lower = "2020-01-01T00:00:00Z"
        upper = "2021-01-01T00:00:00Z"
        rendered = translate_range("created", lower, upper, UTC)
        assert rendered == "created:[2020-01-01T00:00:00Z TO 2021-01-01T00:00:00Z]"

    def test_translate_query_iso_range_preserved(self) -> None:
        """A query that is already an ISO range translates to itself."""
        query = "created:[2026-01-01T00:00:00Z TO 2026-06-01T00:00:00Z]"
        translated = translate_query(query, UTC)
        assert translated == query

    def test_translate_query_comma_separated_iso_ranges(self) -> None:
        """Two comma-separated ISO ranges are joined with AND, bounds untouched."""
        query = (
            "created:[2026-01-01T00:00:00Z TO 2026-06-01T00:00:00Z],"
            "added:[2026-05-01T00:00:00Z TO 2026-06-01T00:00:00Z]"
        )
        expected = (
            "created:[2026-01-01T00:00:00Z TO 2026-06-01T00:00:00Z]"
            " AND "
            "added:[2026-05-01T00:00:00Z TO 2026-06-01T00:00:00Z]"
        )
        assert translate_query(query, UTC) == expected

    def test_invalid_iso_datetime_raises(self) -> None:
        """A token containing 'T' that is not a valid ISO datetime raises."""
        bad_lower = "2020-01-01T99:00:00Z"
        with pytest.raises(InvalidDateQuery) as caught:
            translate_range("created", bad_lower, "2021-01-01T00:00:00Z", UTC)
        err = caught.value
        assert err.field == "created"
        assert err.value == "2020-01-01T99:00:00Z"

    def test_parse_acceptance_iso_bounds(self, index: tantivy.Index) -> None:
        """A translated ISO range is accepted by the index parser."""
        query = "created:[2026-01-01T00:00:00Z TO 2026-06-01T00:00:00Z]"
        query_text = translate_query(query, UTC)
        index.parse_query(query_text, DEFAULT_SEARCH_FIELDS, field_boosts=_FIELD_BOOSTS)

    def test_parse_acceptance_comma_iso_ranges(self, index: tantivy.Index) -> None:
        """Comma-separated ISO ranges, once translated, are accepted by the parser."""
        query = (
            "created:[2026-01-01T00:00:00Z TO 2026-06-01T00:00:00Z],"
            "added:[2026-05-01T00:00:00Z TO 2026-06-01T00:00:00Z]"
        )
        query_text = translate_query(query, UTC)
        index.parse_query(query_text, DEFAULT_SEARCH_FIELDS, field_boosts=_FIELD_BOOSTS)
|
||||
@@ -725,9 +725,11 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
|
||||
GIVEN:
|
||||
- One document added right now
|
||||
WHEN:
|
||||
- Query with invalid added date
|
||||
- Query with an invalid added date
|
||||
THEN:
|
||||
- 400 Bad Request returned (Tantivy rejects invalid date field syntax)
|
||||
- 400 Bad Request with a message naming the malformed date, so the
|
||||
user knows their date is invalid rather than silently getting zero
|
||||
results
|
||||
"""
|
||||
d1 = Document.objects.create(
|
||||
title="invoice",
|
||||
@@ -740,8 +742,9 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
|
||||
|
||||
response = self.client.get("/api/documents/?query=added:invalid-date")
|
||||
|
||||
# Tantivy rejects unparsable field queries with a 400
|
||||
# An unparsable date is reported as a malformed query, not silently empty.
|
||||
self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
|
||||
self.assertIn("invalid-date", str(response.data["query"]))
|
||||
|
||||
@override_settings(
|
||||
TIME_ZONE="UTC",
|
||||
|
||||
@@ -2277,6 +2277,7 @@ class UnifiedSearchViewSet(DocumentViewSet):
|
||||
return super().list(request)
|
||||
|
||||
from documents.search import SearchHit
|
||||
from documents.search import SearchQueryError
|
||||
from documents.search import TantivyBackend
|
||||
from documents.search import TantivyRelevanceList
|
||||
from documents.search import get_backend
|
||||
@@ -2469,6 +2470,11 @@ class UnifiedSearchViewSet(DocumentViewSet):
|
||||
return HttpResponseForbidden(_("Insufficient permissions."))
|
||||
except ValidationError:
|
||||
raise
|
||||
except SearchQueryError as e:
|
||||
# User-fixable query error (e.g. an unparsable date): surface the
|
||||
# specific message so the user can correct it, rather than a generic
|
||||
# 400 or silently empty results.
|
||||
raise ValidationError({"query": [str(e)]}) from e
|
||||
except Exception as e:
|
||||
logger.warning(f"An error occurred listing search results: {e!s}")
|
||||
return HttpResponseBadRequest(
|
||||
|
||||
Reference in New Issue
Block a user