Using a draft release of tantivy-py, fixes up all the TODO locations with the new API calls

2026-06-24 22:34:22 +00:00 · 2026-04-19 13:14:17 -07:00
parent 0aa8c149bc
commit d5ef58bfd4
5 changed files with 44 additions and 136 deletions
@@ -222,24 +222,9 @@ class WriteBatch:
        self._writer.add_document(doc)

    def remove(self, doc_id: int) -> None:
-        """
-        Remove a document from the batch by its primary key.
-
-        Uses range_query instead of term_query to work around a tantivy-py bug
-        where Python integers are inferred as i64, producing Terms that never
-        match u64 fields.
-
-        TODO: Replace with term_query("id", doc_id) once
-        https://github.com/quickwit-oss/tantivy-py/pull/642 lands.
-        """
+        """Remove a document from the batch by its primary key."""
        self._writer.delete_documents_by_query(
-            tantivy.Query.range_query(
-                self._backend._schema,
-                "id",
-                tantivy.FieldType.Unsigned,
-                doc_id,
-                doc_id,
-            ),
+            tantivy.Query.term_query(self._backend._schema, "id", doc_id),
        )


@@ -526,15 +511,6 @@ class TantivyBackend:
        Use this when you already know which documents to display (from
        search_ids + ORM filtering) and just need highlight data.

-        Note: Each doc_id requires an individual index lookup because tantivy-py
-        does not yet expose a batch fast-field read API. This is acceptable for
-        page-sized batches (typically 25 docs) but should not be called with
-        thousands of IDs.
-
-        TODO: When https://github.com/quickwit-oss/tantivy-py/pull/641 lands,
-        the per-doc range_query lookups here can be replaced with a single
-        collect_u64_fast_field("id", doc_addresses) call.
-
        Args:
            query: The search query (used for snippet generation)
            doc_ids: Ordered list of document IDs to generate hits for
@@ -571,32 +547,42 @@ class TantivyBackend:
            notes_text_query = user_query

        searcher = self._index.searcher()
+
+        # Fetch all requested docs in a single search: every hit MUST match
+        # user_query AND have an id equal to one of doc_ids (OR of term_queries).
+        id_filter = tantivy.Query.boolean_query(
+            [
+                (
+                    tantivy.Occur.Should,
+                    tantivy.Query.term_query(self._schema, "id", did),
+                )
+                for did in doc_ids
+            ],
+        )
+        batch_query = tantivy.Query.boolean_query(
+            [
+                (tantivy.Occur.Must, user_query),
+                (tantivy.Occur.Must, id_filter),
+            ],
+        )
+        batch_results = searcher.search(batch_query, limit=len(doc_ids))
+
+        result_addrs = [addr for _score, addr in batch_results.hits]
+        result_ids = searcher.fast_field_values("id", result_addrs)
+        addr_by_id: dict[int, tuple[float, tantivy.DocAddress]] = {
+            doc_id: (score, addr)
+            for (score, addr), doc_id in zip(batch_results.hits, result_ids)
+        }
+
        snippet_generator = None
        notes_snippet_generator = None
        hits: list[SearchHit] = []

        for rank, doc_id in enumerate(doc_ids, start=rank_start):
-            # Look up document by ID, scoring against the user query so that
-            # the returned SearchHit carries a real BM25 relevance score.
-            id_query = tantivy.Query.range_query(
-                self._schema,
-                "id",
-                tantivy.FieldType.Unsigned,
-                doc_id,
-                doc_id,
-            )
-            scored_query = tantivy.Query.boolean_query(
-                [
-                    (tantivy.Occur.Must, user_query),
-                    (tantivy.Occur.Must, id_query),
-                ],
-            )
-            results = searcher.search(scored_query, limit=1)
-
-            if not results.hits:
+            if doc_id not in addr_by_id:
                continue

-            score, doc_address = results.hits[0]
+            score, doc_address = addr_by_id[doc_id]
            actual_doc = searcher.doc(doc_address)
            doc_dict = actual_doc.to_dict()

@@ -701,10 +687,7 @@ class TantivyBackend:
            if threshold is not None:
                all_hits = [hit for hit in all_hits if hit[1] >= threshold]

-        # TODO: Replace with searcher.collect_u64_fast_field("id", addrs) once
-        # https://github.com/quickwit-oss/tantivy-py/pull/641 lands — eliminates
-        # one stored-doc fetch per result (~80% reduction in search_ids latency).
-        return [searcher.doc(doc_addr).to_dict()["id"][0] for doc_addr, *_ in all_hits]
+        return searcher.fast_field_values("id", [doc_addr for doc_addr, *_ in all_hits])

    def autocomplete(
        self,
@@ -821,13 +804,7 @@ class TantivyBackend:
        self._ensure_open()
        searcher = self._index.searcher()

-        id_query = tantivy.Query.range_query(
-            self._schema,
-            "id",
-            tantivy.FieldType.Unsigned,
-            doc_id,
-            doc_id,
-        )
+        id_query = tantivy.Query.term_query(self._schema, "id", doc_id)
        results = searcher.search(id_query, limit=1)

        if not results.hits:
@@ -851,14 +828,9 @@ class TantivyBackend:
        # Fetch one extra to account for excluding the original document
        results = searcher.search(final_query, limit=effective_limit + 1)

-        # TODO: Replace with collect_u64_fast_field("id", addrs) once
-        # https://github.com/quickwit-oss/tantivy-py/pull/641 lands.
-        ids = []
-        for _score, doc_address in results.hits:
-            result_doc_id = searcher.doc(doc_address).to_dict()["id"][0]
-            if result_doc_id != doc_id:
-                ids.append(result_doc_id)
-
+        addrs = [addr for _score, addr in results.hits]
+        all_ids = searcher.fast_field_values("id", addrs)
+        ids = [rid for rid in all_ids if rid != doc_id]
        return ids[:limit] if limit is not None else ids

    def batch_update(self, lock_timeout: float = 30.0) -> WriteBatch:
@@ -410,9 +410,6 @@ def normalize_query(query: str) -> str:
        raise ValueError("Query too complex to process (normalization timed out)")


-_MAX_U64 = 2**64 - 1  # u64 max — used as inclusive upper bound for "any owner" range
-
-
 def build_permission_filter(
    schema: tantivy.Schema,
    user: AbstractBaseUser,
@@ -432,48 +429,16 @@ def build_permission_filter(

    Returns:
        Tantivy query that filters results to visible documents
-
-    Implementation Notes:
-        - Uses range_query instead of term_query for owner_id/viewer_id to work
-          around a tantivy-py bug where Python ints are inferred as i64, causing
-          term_query to return no hits on u64 fields.
-          TODO: Replace with term_query once
-          https://github.com/quickwit-oss/tantivy-py/pull/642 lands.
-
-        - Uses range_query(owner_id, 1, MAX_U64) as an "owner exists" check
-          because exists_query is not yet available in tantivy-py 0.25.
-          TODO: Replace with exists_query("owner_id") once that is exposed in
-          a tantivy-py release.
-
-        - Uses disjunction_max_query to combine permission clauses with OR logic
    """
-    owner_any = tantivy.Query.range_query(
-        schema,
-        "owner_id",
-        tantivy.FieldType.Unsigned,
-        1,
-        _MAX_U64,
-    )
+    owner_any = tantivy.Query.exists_query("owner_id")
    no_owner = tantivy.Query.boolean_query(
        [
            (tantivy.Occur.Must, tantivy.Query.all_query()),
            (tantivy.Occur.MustNot, owner_any),
        ],
    )
-    owned = tantivy.Query.range_query(
-        schema,
-        "owner_id",
-        tantivy.FieldType.Unsigned,
-        user.pk,
-        user.pk,
-    )
-    shared = tantivy.Query.range_query(
-        schema,
-        "viewer_id",
-        tantivy.FieldType.Unsigned,
-        user.pk,
-        user.pk,
-    )
+    owned = tantivy.Query.term_query(schema, "owner_id", user.pk)
+    shared = tantivy.Query.term_query(schema, "viewer_id", user.pk)
    return tantivy.Query.disjunction_max_query([no_owner, owned, shared])


@@ -24,13 +24,7 @@ class TestDocumentAdmin(DirectoriesMixin, TestCase):
        backend = get_backend()
        searcher = backend._index.searcher()
        results = searcher.search(
-            tantivy.Query.range_query(
-                backend._schema,
-                "id",
-                tantivy.FieldType.Unsigned,
-                doc.pk,
-                doc.pk,
-            ),
+            tantivy.Query.term_query(backend._schema, "id", doc.pk),
            limit=1,
        )
        if results.hits: