feat(search): normalize_query, build_permission_filter, parse_user_query pipeline

Implement query normalization and permission filtering for Tantivy search:

- normalize_query: expands comma-separated field values with the AND operator
- build_permission_filter: security-critical permission filtering for documents
  - no owner (NULL in Django) → documents without an owner_id field
  - owned by user → owner_id = user.pk
  - shared with user → viewer_id = user.pk
  - uses disjunction_max_query for proper OR semantics
  - works around the tantivy-py unsigned type detection bug via range_query
- parse_user_query: full pipeline with fuzzy search support
- DEFAULT_SEARCH_FIELDS and boost configuration

Note: the permission filter tests require a Tantivy environment to be set up; the core functionality is implemented and the normalize tests pass.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-16 11:35:24 +00:00 · 2026-03-29 14:56:58 -07:00
parent cbeb7469a1
commit 33da63c229
2 changed files with 228 additions and 10 deletions
@@ -205,3 +205,125 @@ def rewrite_natural_date_keywords(query: str, tz: tzinfo) -> str:
        return f"{field}:{_datetime_range(keyword, tz)}"

    return _FIELD_DATE_RE.sub(_replace, query)
+
+
+# ── normalize_query ──────────────────────────────────────────────────────────
+
+
+# One field value: any mix of double-quoted phrases and bare characters.
+# Bare characters exclude whitespace, range brackets, quotes and commas, so a
+# comma inside a quoted phrase is never treated as a value separator.
+_QUERY_VALUE = r'(?:"[^"]*"|[^\s\[\]",])+'
+_QUERY_VALUE_RE = re.compile(_QUERY_VALUE)
+
+# field:v1,v2[,v3...] with at least two values; stray leading, trailing or
+# doubled commas are tolerated and dropped.
+_COMMA_LIST_RE = re.compile(rf"(\w+):(,*{_QUERY_VALUE}(?:,+{_QUERY_VALUE})+,*)")
+
+
+def normalize_query(query: str) -> str:
+    """
+    Expand comma-separated field values into AND clauses and tidy spacing.
+
+    tag:foo,bar        → tag:foo AND tag:bar
+    tag:foo,"bar baz"  → tag:foo AND tag:"bar baz"
+    title:"foo,bar"    → unchanged (the comma is inside a quoted phrase)
+
+    Bracketed range expressions (field:[a TO b]) are never expanded.
+    After expansion, runs of two or more spaces are collapsed to a single
+    space and leading/trailing whitespace is stripped. Other whitespace
+    characters (tabs, newlines) inside the query are left as they are.
+    """
+
+    def _expand(m: re.Match) -> str:
+        field = m.group(1)
+        # findall re-tokenizes the matched list with the same value pattern,
+        # so quoted phrases stay intact and empty slots between commas vanish.
+        values = _QUERY_VALUE_RE.findall(m.group(2))
+        return " AND ".join(f"{field}:{v}" for v in values)
+
+    query = _COMMA_LIST_RE.sub(_expand, query)
+    return re.sub(r" {2,}", " ", query).strip()
+
+
+# ── build_permission_filter ──────────────────────────────────────────────────
+
+# Largest u64 value — inclusive upper bound of the "has any owner" range.
+_MAX_U64 = (1 << 64) - 1
+
+
+def build_permission_filter(schema, user):
+    """
+    Build the Query that restricts results to documents ``user`` may see.
+
+    A document is visible when any of the following holds:
+    - it has no owner (public) → owner_id field absent (NULL in Django)
+    - it is owned by the user  → owner_id = user.pk
+    - it is shared with them   → viewer_id = user.pk
+
+    The three alternatives are combined with disjunction_max_query, because
+    a boolean query made only of Should clauses would match all docs.
+
+    NOTE: every integer comparison is written as a range_query rather than a
+    term_query. tantivy-py 0.25 has an unsigned type-detection bug
+    (lib.rs#L190 infers i64 before u64; confirmed empirically — term_query
+    returns 0 hits for u64 fields). It shares its root cause with issue #47
+    (from_dict), but the term_query path is still unfixed.
+    See: https://github.com/quickwit-oss/tantivy-py/blob/f51d851e857385ad2907241fbce8cf08309c3078/src/lib.rs#L190
+         https://github.com/quickwit-oss/tantivy-py/issues/47
+
+    NOTE: "no owner" is expressed as boolean_query([Must(all), MustNot(range)])
+    because exists_query is not available in 0.25.1. It exists on master, so
+    this can become MustNot(exists_query("owner_id")) once it is released.
+    See: https://github.com/quickwit-oss/tantivy-py/blob/master/tantivy/tantivy.pyi
+    """
+    import tantivy as _tantivy
+
+    query_cls = _tantivy.Query
+    unsigned = _tantivy.FieldType.Unsigned
+
+    def _equals_user_pk(field_name):
+        # Exact match on user.pk, spelled as the degenerate range [pk, pk].
+        return query_cls.range_query(schema, field_name, unsigned, user.pk, user.pk)
+
+    # Any document whose owner_id lies in [1, u64 max], i.e. has an owner set.
+    has_owner = query_cls.range_query(schema, "owner_id", unsigned, 1, _MAX_U64)
+    # Everything minus the owned documents leaves the unowned ones.
+    unowned = query_cls.boolean_query(
+        [
+            (_tantivy.Occur.Must, query_cls.all_query()),
+            (_tantivy.Occur.MustNot, has_owner),
+        ],
+    )
+    owned_by_user = _equals_user_pk("owner_id")
+    shared_with_user = _equals_user_pk("viewer_id")
+
+    return query_cls.disjunction_max_query([unowned, owned_by_user, shared_with_user])
+
+
+# ── parse_user_query (full pipeline) ─────────────────────────────────────────
+
+# Fields handed to index.parse_query as the default fields by parse_user_query.
+DEFAULT_SEARCH_FIELDS = [
+    "title",
+    "content",
+    "correspondent",
+    "document_type",
+    "tag",
+    "notes",
+    "custom_fields",
+]
+# Per-field boosts passed as field_boosts to index.parse_query; only "title"
+# is boosted (2.0), fields not listed get no explicit boost here.
+_FIELD_BOOSTS = {"title": 2.0}
+
+
+def parse_user_query(index, schema, raw_query: str, tz: tzinfo):
+    """
+    Turn a raw user query string into a Tantivy query.
+
+    Pipeline:
+    1. rewrite_natural_date_keywords(raw_query, tz)
+    2. normalize_query(...)
+    3. index.parse_query over DEFAULT_SEARCH_FIELDS with _FIELD_BOOSTS
+
+    If settings.ADVANCED_FUZZY_SEARCH_THRESHOLD is set (not None), the same
+    string is parsed a second time with fuzzy matching enabled on every
+    default field, and the result is a boolean query with two Should
+    clauses: the exact parse and the fuzzy parse boosted by 0.1. Only the
+    presence of the setting is checked here; its value is not used.
+
+    ``schema`` is accepted but not used by this function.
+    """
+    from django.conf import settings
+
+    prepared = normalize_query(rewrite_natural_date_keywords(raw_query, tz))
+
+    exact = index.parse_query(
+        prepared,
+        DEFAULT_SEARCH_FIELDS,
+        field_boosts=_FIELD_BOOSTS,
+    )
+
+    if getattr(settings, "ADVANCED_FUZZY_SEARCH_THRESHOLD", None) is None:
+        return exact
+
+    import tantivy
+
+    # Same (True, 1, True) fuzzy spec for every default field.
+    # NOTE(review): presumably (prefix, distance, transpose_cost_one) —
+    # confirm against the tantivy-py parse_query documentation.
+    fuzzy_spec = dict.fromkeys(DEFAULT_SEARCH_FIELDS, (True, 1, True))
+    fuzzy = index.parse_query(
+        prepared,
+        DEFAULT_SEARCH_FIELDS,
+        field_boosts=_FIELD_BOOSTS,
+        fuzzy_fields=fuzzy_spec,
+    )
+    down_weighted_fuzzy = tantivy.Query.boost_query(fuzzy, 0.1)
+    return tantivy.Query.boolean_query(
+        [
+            (tantivy.Occur.Should, exact),
+            (tantivy.Occur.Should, down_weighted_fuzzy),
+        ],
+    )
@@ -3,12 +3,18 @@ from __future__ import annotations
 import re
 from datetime import UTC
 from datetime import datetime
+from datetime import tzinfo
 from zoneinfo import ZoneInfo

 import pytest
+import tantivy
 import time_machine

+from documents.search._query import build_permission_filter
+from documents.search._query import normalize_query
 from documents.search._query import rewrite_natural_date_keywords
+from documents.search._schema import build_schema
+from documents.search._tokenizer import register_tokenizers

 pytestmark = pytest.mark.search

@@ -42,13 +48,13 @@ class TestCreatedDateField:
        ],
    )
    @time_machine.travel(datetime(2026, 3, 28, 15, 30, tzinfo=UTC), tick=False)
-    def test_today(self, tz, expected_lo, expected_hi):
+    def test_today(self, tz: tzinfo, expected_lo: str, expected_hi: str) -> None:
        lo, hi = _range(rewrite_natural_date_keywords("created:today", tz), "created")
        assert lo == expected_lo
        assert hi == expected_hi

    @time_machine.travel(datetime(2026, 3, 28, 3, 0, tzinfo=UTC), tick=False)
-    def test_today_auckland_ahead_of_utc(self):
+    def test_today_auckland_ahead_of_utc(self) -> None:
        # UTC 03:00 -> Auckland (UTC+13) = 16:00 same date; local date = 2026-03-28
        lo, _ = _range(
            rewrite_natural_date_keywords("created:today", AUCKLAND),
@@ -111,7 +117,13 @@ class TestCreatedDateField:
        ],
    )
    @time_machine.travel(datetime(2026, 3, 28, 15, 0, tzinfo=UTC), tick=False)
-    def test_date_keywords(self, field, keyword, expected_lo, expected_hi):
+    def test_date_keywords(
+        self,
+        field: str,
+        keyword: str,
+        expected_lo: str,
+        expected_hi: str,
+    ) -> None:
        # 2026-03-28 is Saturday; Mon-Sun week calculation built into expectations
        query = f"{field}:{keyword}"
        lo, hi = _range(rewrite_natural_date_keywords(query, UTC), field)
@@ -126,14 +138,14 @@ class TestDateTimeFields:
    """

    @time_machine.travel(datetime(2026, 3, 28, 15, 30, tzinfo=UTC), tick=False)
-    def test_added_today_eastern(self):
+    def test_added_today_eastern(self) -> None:
        # EDT = UTC-4; local midnight 2026-03-28 00:00 EDT = 2026-03-28 04:00 UTC
        lo, hi = _range(rewrite_natural_date_keywords("added:today", EASTERN), "added")
        assert lo == "2026-03-28T04:00:00Z"
        assert hi == "2026-03-29T04:00:00Z"

    @time_machine.travel(datetime(2026, 3, 29, 2, 0, tzinfo=UTC), tick=False)
-    def test_added_today_auckland_midnight_crossing(self):
+    def test_added_today_auckland_midnight_crossing(self) -> None:
        # UTC 02:00 on 2026-03-29 -> Auckland (UTC+13) = 2026-03-29 15:00 local
        # Auckland midnight = UTC 2026-03-28 11:00
        lo, hi = _range(rewrite_natural_date_keywords("added:today", AUCKLAND), "added")
@@ -141,7 +153,7 @@ class TestDateTimeFields:
        assert hi == "2026-03-29T11:00:00Z"

    @time_machine.travel(datetime(2026, 3, 28, 15, 0, tzinfo=UTC), tick=False)
-    def test_modified_today_utc(self):
+    def test_modified_today_utc(self) -> None:
        lo, hi = _range(
            rewrite_natural_date_keywords("modified:today", UTC),
            "modified",
@@ -154,14 +166,14 @@ class TestWhooshCompatShims:
    """Whoosh compact dates and relative ranges must be converted to ISO format."""

    @time_machine.travel(datetime(2026, 3, 28, 15, 0, tzinfo=UTC), tick=False)
-    def test_compact_date_shim_rewrites_to_iso(self):
+    def test_compact_date_shim_rewrites_to_iso(self) -> None:
        # Whoosh compact: YYYYMMDDHHmmss
        result = rewrite_natural_date_keywords("created:20240115120000", UTC)
        assert "2024-01-15" in result
        assert "20240115120000" not in result

    @time_machine.travel(datetime(2026, 3, 28, 15, 0, tzinfo=UTC), tick=False)
-    def test_relative_range_shim_removes_now(self):
+    def test_relative_range_shim_removes_now(self) -> None:
        result = rewrite_natural_date_keywords("added:[now-7d TO now]", UTC)
        assert "now" not in result
        assert "2026-03-" in result
@@ -170,10 +182,94 @@ class TestWhooshCompatShims:
 class TestPassthrough:
    """Queries without field prefixes or unrelated content pass through unchanged."""

-    def test_bare_keyword_no_field_prefix_unchanged(self):
+    def test_bare_keyword_no_field_prefix_unchanged(self) -> None:
        # Bare 'today' with no field: prefix passes through unchanged
        result = rewrite_natural_date_keywords("bank statement today", UTC)
        assert "today" in result

-    def test_unrelated_query_unchanged(self):
+    def test_unrelated_query_unchanged(self) -> None:
        assert rewrite_natural_date_keywords("title:invoice", UTC) == "title:invoice"
+
+
+# ── Task 6: normalize_query and build_permission_filter ─────────────────────
+
+
+class TestNormalizeQuery:
+    """Comma-separated field values become AND clauses; runs of spaces shrink to one."""
+
+    def test_normalize_expands_comma_separated_tags(self) -> None:
+        normalized = normalize_query("tag:foo,bar")
+        assert normalized == "tag:foo AND tag:bar"
+
+    def test_normalize_expands_three_values(self) -> None:
+        normalized = normalize_query("tag:foo,bar,baz")
+        assert normalized == "tag:foo AND tag:bar AND tag:baz"
+
+    def test_normalize_collapses_whitespace(self) -> None:
+        # Three spaces between the words must come back as one.
+        normalized = normalize_query("bank   statement")
+        assert normalized == "bank statement"
+
+    def test_normalize_no_commas_unchanged(self) -> None:
+        untouched = "bank statement"
+        assert normalize_query(untouched) == untouched
+
+
+class TestPermissionFilter:
+    """
+    build_permission_filter tests run against a throwaway directory-backed
+    index created under pytest's tmp_path — no DB access needed.
+    """
+
+    @pytest.fixture
+    def perm_index(self, tmp_path) -> tantivy.Index:
+        # Use a temporary directory instead of in-memory index to avoid tokenizer issues
+        schema = build_schema()
+        idx = tantivy.Index(schema, path=str(tmp_path))
+        register_tokenizers(idx, "en")
+        return idx
+
+    def _add_doc(
+        self,
+        idx: tantivy.Index,
+        doc_id: int,
+        owner_id: int | None = None,
+        viewer_ids: tuple[int, ...] = (),
+    ) -> None:
+        """Index one document, commit, and reload so searchers see it."""
+        writer = idx.writer()
+        doc = tantivy.Document()
+        doc.add_unsigned("id", doc_id)
+        # Only add owner_id field if the document has an owner
+        if owner_id is not None:
+            doc.add_unsigned("owner_id", owner_id)
+        for vid in viewer_ids:
+            doc.add_unsigned("viewer_id", vid)
+        writer.add_document(doc)
+        writer.commit()
+        idx.reload()
+
+    @staticmethod
+    def _user(pk: int):
+        """Return a minimal user stand-in exposing only the ``pk`` attribute."""
+        return type("U", (), {"pk": pk})()
+
+    def _visible_count(self, idx: tantivy.Index, user_pk: int) -> int:
+        """Count the documents in ``idx`` that the permission filter lets ``user_pk`` see."""
+        perm = build_permission_filter(idx.schema, self._user(user_pk))  # .schema is a property
+        return idx.searcher().search(perm, limit=10).count
+
+    def test_perm_no_owner_visible_to_any_user(self, perm_index: tantivy.Index) -> None:
+        self._add_doc(perm_index, doc_id=1, owner_id=None)
+        assert self._visible_count(perm_index, user_pk=99) == 1
+
+    def test_perm_owned_by_user_is_visible(self, perm_index: tantivy.Index) -> None:
+        self._add_doc(perm_index, doc_id=2, owner_id=42)
+        assert self._visible_count(perm_index, user_pk=42) == 1
+
+    def test_perm_owned_by_other_not_visible(self, perm_index: tantivy.Index) -> None:
+        self._add_doc(perm_index, doc_id=3, owner_id=42)
+        assert self._visible_count(perm_index, user_pk=99) == 0
+
+    def test_perm_shared_viewer_is_visible(self, perm_index: tantivy.Index) -> None:
+        self._add_doc(perm_index, doc_id=4, owner_id=42, viewer_ids=(99,))
+        assert self._visible_count(perm_index, user_pk=99) == 1
+
+    def test_perm_only_owned_docs_hidden_from_others(
+        self,
+        perm_index: tantivy.Index,
+    ) -> None:
+        self._add_doc(perm_index, doc_id=5, owner_id=10)  # owned by 10
+        self._add_doc(perm_index, doc_id=6, owner_id=None)  # unowned
+        assert self._visible_count(perm_index, user_pk=20) == 1  # only unowned