From 759717404e294aeb5b38475855f7274236779543 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Tue, 7 Apr 2026 14:45:16 -0700 Subject: [PATCH] Adds notes on where we can improve once fixes, features, or a new release arrive from Tantivy --- src/documents/search/_backend.py | 20 ++++++++++++++++---- src/documents/search/_query.py | 15 +++++++++++---- 2 files changed, 27 insertions(+), 8 deletions(-) diff --git a/src/documents/search/_backend.py b/src/documents/search/_backend.py index 476315d57..652cc2c3e 100644 --- a/src/documents/search/_backend.py +++ b/src/documents/search/_backend.py @@ -224,10 +224,13 @@ class WriteBatch: """ Remove a document from the batch by its primary key. - Uses range query instead of term query to work around unsigned integer - type detection bug in tantivy-py 0.25. + Uses range_query instead of term_query to work around a tantivy-py bug + where Python integers are inferred as i64, producing Terms that never + match u64 fields. + + TODO: Replace with term_query("id", doc_id) once + https://github.com/quickwit-oss/tantivy-py/pull/642 lands. """ - # Use range query to work around u64 deletion bug self._writer.delete_documents_by_query( tantivy.Query.range_query( self._backend._schema, @@ -518,10 +521,14 @@ class TantivyBackend: search_ids + ORM filtering) and just need highlight data. Note: Each doc_id requires an individual index lookup because tantivy-py - does not expose a batch doc-address-by-ID API. This is acceptable for + does not yet expose a batch fast-field read API. This is acceptable for page-sized batches (typically 25 docs) but should not be called with thousands of IDs. + TODO: When https://github.com/quickwit-oss/tantivy-py/pull/641 lands, + the per-doc range_query lookups here can be replaced with a single + collect_u64_fast_field("id", doc_addresses) call. 
+ Args: query: The search query (used for snippet generation) doc_ids: Ordered list of document IDs to generate hits for @@ -665,6 +672,9 @@ class TantivyBackend: if threshold is not None: all_hits = [hit for hit in all_hits if hit[1] >= threshold] + # TODO: Replace with searcher.collect_u64_fast_field("id", addrs) once + # https://github.com/quickwit-oss/tantivy-py/pull/641 lands — eliminates + # one stored-doc fetch per result (~80% reduction in search_ids latency). return [searcher.doc(doc_addr).to_dict()["id"][0] for doc_addr, *_ in all_hits] def autocomplete( @@ -812,6 +822,8 @@ class TantivyBackend: # Fetch one extra to account for excluding the original document results = searcher.search(final_query, limit=effective_limit + 1) + # TODO: Replace with collect_u64_fast_field("id", addrs) once + # https://github.com/quickwit-oss/tantivy-py/pull/641 lands. ids = [] for _score, doc_address in results.hits: result_doc_id = searcher.doc(doc_address).to_dict()["id"][0] diff --git a/src/documents/search/_query.py b/src/documents/search/_query.py index b7bcbbe9c..ed0bb4c15 100644 --- a/src/documents/search/_query.py +++ b/src/documents/search/_query.py @@ -396,10 +396,17 @@ def build_permission_filter( Tantivy query that filters results to visible documents Implementation Notes: - - Uses range_query instead of term_query to work around unsigned integer - type detection bug in tantivy-py 0.25 - - Uses boolean_query for "no owner" check since exists_query is not - available in tantivy-py 0.25.1 (available in master) + - Uses range_query instead of term_query for owner_id/viewer_id to work + around a tantivy-py bug where Python ints are inferred as i64, causing + term_query to return no hits on u64 fields. + TODO: Replace with term_query once + https://github.com/quickwit-oss/tantivy-py/pull/642 lands. + + - Uses range_query(owner_id, 1, MAX_U64) as an "owner exists" check + because exists_query is not yet available in tantivy-py 0.25. 
+ TODO: Replace with exists_query("owner_id") once that is exposed in + a tantivy-py release. + - Uses disjunction_max_query to combine permission clauses with OR logic """ owner_any = tantivy.Query.range_query(