Using a draft release of tantivy-py, fixes up all the TODO locations with the new API calls

This commit is contained in:
Trenton Holmes
2026-04-19 13:14:17 -07:00
parent 0aa8c149bc
commit d5ef58bfd4
5 changed files with 44 additions and 136 deletions
+36 -64
View File
@@ -222,24 +222,9 @@ class WriteBatch:
self._writer.add_document(doc)
def remove(self, doc_id: int) -> None:
"""
Remove a document from the batch by its primary key.
Uses range_query instead of term_query to work around a tantivy-py bug
where Python integers are inferred as i64, producing Terms that never
match u64 fields.
TODO: Replace with term_query("id", doc_id) once
https://github.com/quickwit-oss/tantivy-py/pull/642 lands.
"""
"""Remove a document from the batch by its primary key."""
self._writer.delete_documents_by_query(
tantivy.Query.range_query(
self._backend._schema,
"id",
tantivy.FieldType.Unsigned,
doc_id,
doc_id,
),
tantivy.Query.term_query(self._backend._schema, "id", doc_id),
)
@@ -526,15 +511,6 @@ class TantivyBackend:
Use this when you already know which documents to display (from
search_ids + ORM filtering) and just need highlight data.
Note: Each doc_id requires an individual index lookup because tantivy-py
does not yet expose a batch fast-field read API. This is acceptable for
page-sized batches (typically 25 docs) but should not be called with
thousands of IDs.
TODO: When https://github.com/quickwit-oss/tantivy-py/pull/641 lands,
the per-doc range_query lookups here can be replaced with a single
collect_u64_fast_field("id", doc_addresses) call.
Args:
query: The search query (used for snippet generation)
doc_ids: Ordered list of document IDs to generate hits for
@@ -571,32 +547,42 @@ class TantivyBackend:
notes_text_query = user_query
searcher = self._index.searcher()
# Fetch all requested docs in a single search: user_query MUST match
# and exactly the requested IDs MUST match (OR of term_queries).
id_filter = tantivy.Query.boolean_query(
[
(
tantivy.Occur.Should,
tantivy.Query.term_query(self._schema, "id", did),
)
for did in doc_ids
],
)
batch_query = tantivy.Query.boolean_query(
[
(tantivy.Occur.Must, user_query),
(tantivy.Occur.Must, id_filter),
],
)
batch_results = searcher.search(batch_query, limit=len(doc_ids))
result_addrs = [addr for _score, addr in batch_results.hits]
result_ids = searcher.fast_field_values("id", result_addrs)
addr_by_id: dict[int, tuple[float, tantivy.DocAddress]] = {
doc_id: (score, addr)
for (score, addr), doc_id in zip(batch_results.hits, result_ids)
}
snippet_generator = None
notes_snippet_generator = None
hits: list[SearchHit] = []
for rank, doc_id in enumerate(doc_ids, start=rank_start):
# Look up document by ID, scoring against the user query so that
# the returned SearchHit carries a real BM25 relevance score.
id_query = tantivy.Query.range_query(
self._schema,
"id",
tantivy.FieldType.Unsigned,
doc_id,
doc_id,
)
scored_query = tantivy.Query.boolean_query(
[
(tantivy.Occur.Must, user_query),
(tantivy.Occur.Must, id_query),
],
)
results = searcher.search(scored_query, limit=1)
if not results.hits:
if doc_id not in addr_by_id:
continue
score, doc_address = results.hits[0]
score, doc_address = addr_by_id[doc_id]
actual_doc = searcher.doc(doc_address)
doc_dict = actual_doc.to_dict()
@@ -701,10 +687,7 @@ class TantivyBackend:
if threshold is not None:
all_hits = [hit for hit in all_hits if hit[1] >= threshold]
# TODO: Replace with searcher.collect_u64_fast_field("id", addrs) once
# https://github.com/quickwit-oss/tantivy-py/pull/641 lands — eliminates
# one stored-doc fetch per result (~80% reduction in search_ids latency).
return [searcher.doc(doc_addr).to_dict()["id"][0] for doc_addr, *_ in all_hits]
return searcher.fast_field_values("id", [doc_addr for doc_addr, *_ in all_hits])
def autocomplete(
self,
@@ -821,13 +804,7 @@ class TantivyBackend:
self._ensure_open()
searcher = self._index.searcher()
id_query = tantivy.Query.range_query(
self._schema,
"id",
tantivy.FieldType.Unsigned,
doc_id,
doc_id,
)
id_query = tantivy.Query.term_query(self._schema, "id", doc_id)
results = searcher.search(id_query, limit=1)
if not results.hits:
@@ -851,14 +828,9 @@ class TantivyBackend:
# Fetch one extra to account for excluding the original document
results = searcher.search(final_query, limit=effective_limit + 1)
# TODO: Replace with collect_u64_fast_field("id", addrs) once
# https://github.com/quickwit-oss/tantivy-py/pull/641 lands.
ids = []
for _score, doc_address in results.hits:
result_doc_id = searcher.doc(doc_address).to_dict()["id"][0]
if result_doc_id != doc_id:
ids.append(result_doc_id)
addrs = [addr for _score, addr in results.hits]
all_ids = searcher.fast_field_values("id", addrs)
ids = [rid for rid in all_ids if rid != doc_id]
return ids[:limit] if limit is not None else ids
def batch_update(self, lock_timeout: float = 30.0) -> WriteBatch:
+3 -38
View File
@@ -410,9 +410,6 @@ def normalize_query(query: str) -> str:
raise ValueError("Query too complex to process (normalization timed out)")
_MAX_U64 = 2**64 - 1 # u64 max — used as inclusive upper bound for "any owner" range
def build_permission_filter(
schema: tantivy.Schema,
user: AbstractBaseUser,
@@ -432,48 +429,16 @@ def build_permission_filter(
Returns:
Tantivy query that filters results to visible documents
Implementation Notes:
- Uses range_query instead of term_query for owner_id/viewer_id to work
around a tantivy-py bug where Python ints are inferred as i64, causing
term_query to return no hits on u64 fields.
TODO: Replace with term_query once
https://github.com/quickwit-oss/tantivy-py/pull/642 lands.
- Uses range_query(owner_id, 1, MAX_U64) as an "owner exists" check
because exists_query is not yet available in tantivy-py 0.25.
TODO: Replace with exists_query("owner_id") once that is exposed in
a tantivy-py release.
- Uses disjunction_max_query to combine permission clauses with OR logic
"""
owner_any = tantivy.Query.range_query(
schema,
"owner_id",
tantivy.FieldType.Unsigned,
1,
_MAX_U64,
)
owner_any = tantivy.Query.exists_query("owner_id")
no_owner = tantivy.Query.boolean_query(
[
(tantivy.Occur.Must, tantivy.Query.all_query()),
(tantivy.Occur.MustNot, owner_any),
],
)
owned = tantivy.Query.range_query(
schema,
"owner_id",
tantivy.FieldType.Unsigned,
user.pk,
user.pk,
)
shared = tantivy.Query.range_query(
schema,
"viewer_id",
tantivy.FieldType.Unsigned,
user.pk,
user.pk,
)
owned = tantivy.Query.term_query(schema, "owner_id", user.pk)
shared = tantivy.Query.term_query(schema, "viewer_id", user.pk)
return tantivy.Query.disjunction_max_query([no_owner, owned, shared])
+1 -7
View File
@@ -24,13 +24,7 @@ class TestDocumentAdmin(DirectoriesMixin, TestCase):
backend = get_backend()
searcher = backend._index.searcher()
results = searcher.search(
tantivy.Query.range_query(
backend._schema,
"id",
tantivy.FieldType.Unsigned,
doc.pk,
doc.pk,
),
tantivy.Query.term_query(backend._schema, "id", doc.pk),
limit=1,
)
if results.hits: