# Added to src/documents/search/_query.py, directly after
# rewrite_natural_date_keywords().

# ── normalize_query ──────────────────────────────────────────────────────────

# First alternative swallows a complete double-quoted phrase so nothing inside
# it is rewritten. Second alternative is the ``field:value,value[,…]`` form;
# values never contain whitespace or square brackets, which keeps
# ``field:[a TO b]`` range expressions out of the match.
_COMMA_LIST_RE = re.compile(r'"[^"]*"|(\w+):([^\s\[\]]+(?:,[^\s\[\]]+)+)')
_WHITESPACE_RE = re.compile(r"\s+")


def _split_top_level_commas(text: str) -> list[str]:
    """Split *text* on commas that are not inside a double-quoted section.

    For text without quotes this is exactly ``text.split(",")``. An opening
    quote without a closing one protects everything after it.
    """
    parts: list[str] = []
    current: list[str] = []
    in_quote = False
    for ch in text:
        if ch == '"':
            in_quote = not in_quote
        if ch == "," and not in_quote:
            parts.append("".join(current))
            current = []
        else:
            current.append(ch)
    parts.append("".join(current))
    return parts


def normalize_query(query: str) -> str:
    """
    Join comma-separated field values with AND, collapse whitespace.
        tag:foo,bar       → tag:foo AND tag:bar
        tag:"a,b",c       → tag:"a,b" AND tag:c
        title:"foo,bar"   → unchanged (comma is inside the quoted phrase)

    Empty values are skipped (``tag:foo,,bar`` → ``tag:foo AND tag:bar``).
    If every value is empty the clause is left as typed rather than being
    silently removed. Any run of whitespace (spaces, tabs, newlines) becomes a
    single space and the result is stripped.
    """

    def _expand(m: re.Match[str]) -> str:
        field = m.group(1)
        if field is None:
            # Matched a quoted phrase: pass it through untouched.
            return m.group(0)
        values = [
            v for v in (p.strip() for p in _split_top_level_commas(m.group(2))) if v
        ]
        if not values:
            # e.g. "tag:,,," — nothing usable; keep the user's text.
            return m.group(0)
        return " AND ".join(f"{field}:{v}" for v in values)

    query = _COMMA_LIST_RE.sub(_expand, query)
    return _WHITESPACE_RE.sub(" ", query).strip()


# ── build_permission_filter ──────────────────────────────────────────────────

_MAX_U64 = 2**64 - 1  # u64 max — used as inclusive upper bound for "any owner" range


def build_permission_filter(schema, user):
    """
    Returns a Query matching documents visible to user:
      - no owner (public)  → owner_id field absent (NULL in Django)
      - owned by user      → owner_id = user.pk
      - shared with user   → viewer_id = user.pk

    Uses disjunction_max_query — boolean Should-only would match all docs.

    NOTE: all integer queries use range_query, not term_query, to avoid the
    unsigned type-detection bug in tantivy-py 0.25 (lib.rs#L190 infers i64
    before u64; confirmed empirically — term_query returns 0 for u64 fields).
    Same root cause as issue #47 (from_dict) but the term_query path unfixed.
    See: https://github.com/quickwit-oss/tantivy-py/blob/f51d851e857385ad2907241fbce8cf08309c3078/src/lib.rs#L190
         https://github.com/quickwit-oss/tantivy-py/issues/47

    NOTE: no_owner uses boolean_query([Must(all), MustNot(range)]) because
    exists_query is not available in 0.25.1. It is present in master and can
    simplify this to MustNot(exists_query("owner_id")) once released.
    See: https://github.com/quickwit-oss/tantivy-py/blob/master/tantivy/tantivy.pyi

    NOTE(review): assumes user.pk is a non-negative int (a saved user). A
    user object whose pk is None is not handled here — confirm callers.
    """
    # Imported lazily, under a private alias, as in the original.
    import tantivy as _tantivy

    def _u64_range(field: str, lo: int, hi: int):
        # Range over an unsigned field (see the range_query NOTE above).
        return _tantivy.Query.range_query(
            schema,
            field,
            _tantivy.FieldType.Unsigned,
            lo,
            hi,
        )

    # "Has any owner" = owner_id in [1, u64 max]; negated below for "no owner".
    owner_any = _u64_range("owner_id", 1, _MAX_U64)
    no_owner = _tantivy.Query.boolean_query(
        [
            (_tantivy.Occur.Must, _tantivy.Query.all_query()),
            (_tantivy.Occur.MustNot, owner_any),
        ],
    )
    # Equality is expressed as a one-value range [pk, pk].
    owned = _u64_range("owner_id", user.pk, user.pk)
    shared = _u64_range("viewer_id", user.pk, user.pk)
    return _tantivy.Query.disjunction_max_query([no_owner, owned, shared])


# ── parse_user_query (full pipeline) ─────────────────────────────────────────

DEFAULT_SEARCH_FIELDS = [
    "title",
    "content",
    "correspondent",
    "document_type",
    "tag",
    "notes",
    "custom_fields",
]
_FIELD_BOOSTS = {"title": 2.0}


def parse_user_query(index, schema, raw_query: str, tz: tzinfo):
    """
    Turn a raw user query string into a tantivy Query.

    Pipeline:
      1. rewrite_natural_date_keywords(raw_query, tz)
      2. normalize_query(...)
      3. index.parse_query over DEFAULT_SEARCH_FIELDS with _FIELD_BOOSTS

    If settings.ADVANCED_FUZZY_SEARCH_THRESHOLD is set (not None), the same
    string is parsed a second time with fuzzy matching on every default field
    and the result is Should(exact) OR Should(fuzzy boosted by 0.1). Otherwise
    the exact query is returned on its own.

    NOTE(review): only the presence of the threshold setting is checked here;
    its value is not used in this function. ``schema`` is accepted but unused.
    """
    from django.conf import settings

    query_str = rewrite_natural_date_keywords(raw_query, tz)
    query_str = normalize_query(query_str)

    exact = index.parse_query(
        query_str,
        DEFAULT_SEARCH_FIELDS,
        field_boosts=_FIELD_BOOSTS,
    )

    threshold = getattr(settings, "ADVANCED_FUZZY_SEARCH_THRESHOLD", None)
    if threshold is None:
        return exact

    import tantivy

    fuzzy = index.parse_query(
        query_str,
        DEFAULT_SEARCH_FIELDS,
        field_boosts=_FIELD_BOOSTS,
        # Tuple layout per tantivy-py's parse_query:
        # (prefix, distance, transpose_cost_one).
        fuzzy_fields={f: (True, 1, True) for f in DEFAULT_SEARCH_FIELDS},
    )
    return tantivy.Query.boolean_query(
        [
            (tantivy.Occur.Should, exact),
            (tantivy.Occur.Should, tantivy.Query.boost_query(fuzzy, 0.1)),
        ],
    )
b/src/documents/tests/search/test_query.py @@ -3,12 +3,18 @@ from __future__ import annotations import re from datetime import UTC from datetime import datetime +from datetime import tzinfo from zoneinfo import ZoneInfo import pytest +import tantivy import time_machine +from documents.search._query import build_permission_filter +from documents.search._query import normalize_query from documents.search._query import rewrite_natural_date_keywords +from documents.search._schema import build_schema +from documents.search._tokenizer import register_tokenizers pytestmark = pytest.mark.search @@ -42,13 +48,13 @@ class TestCreatedDateField: ], ) @time_machine.travel(datetime(2026, 3, 28, 15, 30, tzinfo=UTC), tick=False) - def test_today(self, tz, expected_lo, expected_hi): + def test_today(self, tz: tzinfo, expected_lo: str, expected_hi: str) -> None: lo, hi = _range(rewrite_natural_date_keywords("created:today", tz), "created") assert lo == expected_lo assert hi == expected_hi @time_machine.travel(datetime(2026, 3, 28, 3, 0, tzinfo=UTC), tick=False) - def test_today_auckland_ahead_of_utc(self): + def test_today_auckland_ahead_of_utc(self) -> None: # UTC 03:00 -> Auckland (UTC+13) = 16:00 same date; local date = 2026-03-28 lo, _ = _range( rewrite_natural_date_keywords("created:today", AUCKLAND), @@ -111,7 +117,13 @@ class TestCreatedDateField: ], ) @time_machine.travel(datetime(2026, 3, 28, 15, 0, tzinfo=UTC), tick=False) - def test_date_keywords(self, field, keyword, expected_lo, expected_hi): + def test_date_keywords( + self, + field: str, + keyword: str, + expected_lo: str, + expected_hi: str, + ) -> None: # 2026-03-28 is Saturday; Mon-Sun week calculation built into expectations query = f"{field}:{keyword}" lo, hi = _range(rewrite_natural_date_keywords(query, UTC), field) @@ -126,14 +138,14 @@ class TestDateTimeFields: """ @time_machine.travel(datetime(2026, 3, 28, 15, 30, tzinfo=UTC), tick=False) - def test_added_today_eastern(self): + def 
test_added_today_eastern(self) -> None: # EDT = UTC-4; local midnight 2026-03-28 00:00 EDT = 2026-03-28 04:00 UTC lo, hi = _range(rewrite_natural_date_keywords("added:today", EASTERN), "added") assert lo == "2026-03-28T04:00:00Z" assert hi == "2026-03-29T04:00:00Z" @time_machine.travel(datetime(2026, 3, 29, 2, 0, tzinfo=UTC), tick=False) - def test_added_today_auckland_midnight_crossing(self): + def test_added_today_auckland_midnight_crossing(self) -> None: # UTC 02:00 on 2026-03-29 -> Auckland (UTC+13) = 2026-03-29 15:00 local # Auckland midnight = UTC 2026-03-28 11:00 lo, hi = _range(rewrite_natural_date_keywords("added:today", AUCKLAND), "added") @@ -141,7 +153,7 @@ class TestDateTimeFields: assert hi == "2026-03-29T11:00:00Z" @time_machine.travel(datetime(2026, 3, 28, 15, 0, tzinfo=UTC), tick=False) - def test_modified_today_utc(self): + def test_modified_today_utc(self) -> None: lo, hi = _range( rewrite_natural_date_keywords("modified:today", UTC), "modified", @@ -154,14 +166,14 @@ class TestWhooshCompatShims: """Whoosh compact dates and relative ranges must be converted to ISO format.""" @time_machine.travel(datetime(2026, 3, 28, 15, 0, tzinfo=UTC), tick=False) - def test_compact_date_shim_rewrites_to_iso(self): + def test_compact_date_shim_rewrites_to_iso(self) -> None: # Whoosh compact: YYYYMMDDHHmmss result = rewrite_natural_date_keywords("created:20240115120000", UTC) assert "2024-01-15" in result assert "20240115120000" not in result @time_machine.travel(datetime(2026, 3, 28, 15, 0, tzinfo=UTC), tick=False) - def test_relative_range_shim_removes_now(self): + def test_relative_range_shim_removes_now(self) -> None: result = rewrite_natural_date_keywords("added:[now-7d TO now]", UTC) assert "now" not in result assert "2026-03-" in result @@ -170,10 +182,94 @@ class TestWhooshCompatShims: class TestPassthrough: """Queries without field prefixes or unrelated content pass through unchanged.""" - def test_bare_keyword_no_field_prefix_unchanged(self): + def 
class TestPassthrough:
    """Queries without field prefixes or unrelated content pass through unchanged."""

    def test_bare_keyword_no_field_prefix_unchanged(self) -> None:
        # Bare 'today' with no field: prefix passes through unchanged
        result = rewrite_natural_date_keywords("bank statement today", UTC)
        assert "today" in result

    def test_unrelated_query_unchanged(self) -> None:
        assert rewrite_natural_date_keywords("title:invoice", UTC) == "title:invoice"


# ── Task 6: normalize_query and build_permission_filter ─────────────────────


class TestNormalizeQuery:
    """normalize_query expands comma-separated values and collapses whitespace."""

    def test_normalize_expands_comma_separated_tags(self) -> None:
        assert normalize_query("tag:foo,bar") == "tag:foo AND tag:bar"

    def test_normalize_expands_three_values(self) -> None:
        assert normalize_query("tag:foo,bar,baz") == "tag:foo AND tag:bar AND tag:baz"

    def test_normalize_collapses_whitespace(self) -> None:
        # Input must contain a run of spaces, otherwise nothing is collapsed
        # and this test duplicates test_normalize_no_commas_unchanged.
        assert normalize_query("bank   statement") == "bank statement"

    def test_normalize_no_commas_unchanged(self) -> None:
        assert normalize_query("bank statement") == "bank statement"


class TestPermissionFilter:
    """build_permission_filter tests use an on-disk index under tmp_path — no DB access needed."""

    @pytest.fixture
    def perm_index(self, tmp_path) -> tantivy.Index:
        # Use a temporary directory instead of in-memory index to avoid tokenizer issues
        schema = build_schema()
        idx = tantivy.Index(schema, path=str(tmp_path))
        register_tokenizers(idx, "en")
        return idx

    @staticmethod
    def _user(pk: int):
        # Minimal stand-in for a user: the tests only set `pk` on it.
        return type("U", (), {"pk": pk})()

    def _add_doc(
        self,
        idx: tantivy.Index,
        doc_id: int,
        owner_id: int | None = None,
        viewer_ids: tuple[int, ...] = (),
    ) -> None:
        # One writer + commit + reload per call, so the document is
        # searchable as soon as this helper returns.
        writer = idx.writer()
        doc = tantivy.Document()
        doc.add_unsigned("id", doc_id)
        # Only add owner_id field if the document has an owner
        if owner_id is not None:
            doc.add_unsigned("owner_id", owner_id)
        for vid in viewer_ids:
            doc.add_unsigned("viewer_id", vid)
        writer.add_document(doc)
        writer.commit()
        idx.reload()

    def _visible_count(self, idx: tantivy.Index, pk: int) -> int:
        # Number of hits the permission filter yields for a user with this pk.
        perm = build_permission_filter(idx.schema, self._user(pk))  # .schema is a property
        return idx.searcher().search(perm, limit=10).count

    def test_perm_no_owner_visible_to_any_user(self, perm_index: tantivy.Index) -> None:
        self._add_doc(perm_index, doc_id=1, owner_id=None)
        assert self._visible_count(perm_index, 99) == 1

    def test_perm_owned_by_user_is_visible(self, perm_index: tantivy.Index) -> None:
        self._add_doc(perm_index, doc_id=2, owner_id=42)
        assert self._visible_count(perm_index, 42) == 1

    def test_perm_owned_by_other_not_visible(self, perm_index: tantivy.Index) -> None:
        self._add_doc(perm_index, doc_id=3, owner_id=42)
        assert self._visible_count(perm_index, 99) == 0

    def test_perm_shared_viewer_is_visible(self, perm_index: tantivy.Index) -> None:
        self._add_doc(perm_index, doc_id=4, owner_id=42, viewer_ids=(99,))
        assert self._visible_count(perm_index, 99) == 1

    def test_perm_only_owned_docs_hidden_from_others(
        self,
        perm_index: tantivy.Index,
    ) -> None:
        self._add_doc(perm_index, doc_id=5, owner_id=10)  # owned by 10
        self._add_doc(perm_index, doc_id=6, owner_id=None)  # unowned
        assert self._visible_count(perm_index, 20) == 1  # only unowned